1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
52 #if CUDA_VERSION < 6000
53 extern CUresult
cuGetErrorString (CUresult
, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
57 #if CUDA_VERSION >= 6050
60 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
61 const char *, unsigned, CUjit_option
*, void **);
62 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
64 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
65 const char *, unsigned, CUjit_option
*, void **);
66 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
69 #define DO_PRAGMA(x) _Pragma (#x)
71 #if PLUGIN_NVPTX_DYNAMIC
76 # define CUDA_ONE_CALL(call) \
77 __typeof (call) *call;
78 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
80 #include "cuda-lib.def"
82 # undef CUDA_ONE_CALL_MAYBE_NULL
86 /* -1 if init_cuda_lib has not been called yet, false
87 if it has been and failed, true if it has been and succeeded. */
88 static signed char cuda_lib_inited
= -1;
90 /* Dynamically load the CUDA runtime library and initialize function
91 pointers, return false if unsuccessful, true if successful. */
95 if (cuda_lib_inited
!= -1)
96 return cuda_lib_inited
;
97 const char *cuda_runtime_lib
= "libcuda.so.1";
98 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
99 cuda_lib_inited
= false;
103 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
104 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
105 # define CUDA_ONE_CALL_1(call, allow_null) \
106 cuda_lib.call = dlsym (h, #call); \
107 if (!allow_null && cuda_lib.call == NULL) \
109 #include "cuda-lib.def"
110 # undef CUDA_ONE_CALL
111 # undef CUDA_ONE_CALL_1
112 # undef CUDA_ONE_CALL_MAYBE_NULL
114 cuda_lib_inited
= true;
117 # define CUDA_CALL_PREFIX cuda_lib.
120 # define CUDA_ONE_CALL(call)
121 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
122 #include "cuda-lib.def"
123 #undef CUDA_ONE_CALL_MAYBE_NULL
126 # define CUDA_CALL_PREFIX
127 # define init_cuda_lib() true
130 #include "secure_getenv.h"
134 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
135 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
137 /* Convenience macros for the frequently used CUDA library call and
138 error handling sequence as well as CUDA library calls that
139 do the error checking themselves or don't do it at all. */
141 #define CUDA_CALL_ERET(ERET, FN, ...) \
144 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
145 if (__r != CUDA_SUCCESS) \
147 GOMP_PLUGIN_error (#FN " error: %s", \
153 #define CUDA_CALL(FN, ...) \
154 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
156 #define CUDA_CALL_ASSERT(FN, ...) \
159 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
160 if (__r != CUDA_SUCCESS) \
162 GOMP_PLUGIN_fatal (#FN " error: %s", \
167 #define CUDA_CALL_NOCHECK(FN, ...) \
168 CUDA_CALL_PREFIX FN (__VA_ARGS__)
170 #define CUDA_CALL_EXISTS(FN) \
174 cuda_error (CUresult r
)
176 const char *fallback
= "unknown cuda error";
179 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
182 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
183 if (r
== CUDA_SUCCESS
)
189 static unsigned int instantiated_devices
= 0;
190 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
195 pthread_t host_thread
;
206 struct ptx_stream
*next
;
209 /* Thread-specific data for PTX. */
213 struct ptx_stream
*current_stream
;
214 struct ptx_device
*ptx_dev
;
218 map_init (struct ptx_stream
*s
)
220 int size
= getpagesize ();
226 CUDA_CALL (cuMemAllocHost
, &s
->h
, size
);
227 CUDA_CALL (cuMemHostGetDevicePointer
, &s
->d
, s
->h
, 0);
232 s
->h_end
= s
->h_begin
+ size
;
233 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
241 map_fini (struct ptx_stream
*s
)
243 CUDA_CALL (cuMemFreeHost
, s
->h
);
248 map_pop (struct ptx_stream
*s
)
255 s
->h_tail
= s
->h_next
;
257 if (s
->h_tail
>= s
->h_end
)
258 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
260 if (s
->h_next
== s
->h_tail
)
261 s
->h_prev
= s
->h_next
;
263 assert (s
->h_next
>= s
->h_begin
);
264 assert (s
->h_tail
>= s
->h_begin
);
265 assert (s
->h_prev
>= s
->h_begin
);
267 assert (s
->h_next
<= s
->h_end
);
268 assert (s
->h_tail
<= s
->h_end
);
269 assert (s
->h_prev
<= s
->h_end
);
273 map_push (struct ptx_stream
*s
, size_t size
, void **h
, void **d
)
280 left
= s
->h_end
- s
->h_next
;
287 assert (s
->h_next
== s
->h_prev
);
288 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
293 offset
= s
->h_next
- s
->h
;
295 *d
= (void *)(s
->d
+ offset
);
296 *h
= (void *)(s
->h
+ offset
);
298 s
->h_prev
= s
->h_next
;
304 assert (s
->h_next
>= s
->h_begin
);
305 assert (s
->h_tail
>= s
->h_begin
);
306 assert (s
->h_prev
>= s
->h_begin
);
307 assert (s
->h_next
<= s
->h_end
);
308 assert (s
->h_tail
<= s
->h_end
);
309 assert (s
->h_prev
<= s
->h_end
);
314 /* Target data function launch information. */
316 struct targ_fn_launch
319 unsigned short dim
[GOMP_DIM_MAX
];
322 /* Target PTX object information. */
330 /* Target data image information. */
332 typedef struct nvptx_tdata
334 const struct targ_ptx_obj
*ptx_objs
;
337 const char *const *var_names
;
340 const struct targ_fn_launch
*fn_descs
;
344 /* Descriptor of a loaded function. */
346 struct targ_fn_descriptor
349 const struct targ_fn_launch
*launch
;
351 int max_threads_per_block
;
354 /* A loaded PTX image. */
355 struct ptx_image_data
357 const void *target_data
;
360 struct targ_fn_descriptor
*fns
; /* Array of functions. */
362 struct ptx_image_data
*next
;
370 struct ptx_stream
*null_stream
;
371 /* All non-null streams associated with this device (actually context),
372 either created implicitly or passed in from the user (via
373 acc_set_cuda_stream). */
374 struct ptx_stream
*active_streams
;
376 struct ptx_stream
**arr
;
379 /* A lock for use when manipulating the above stream list and array. */
380 pthread_mutex_t stream_lock
;
392 int max_threads_per_block
;
393 int max_threads_per_multiprocessor
;
394 int default_dims
[GOMP_DIM_MAX
];
396 struct ptx_image_data
*images
; /* Images loaded on device. */
397 pthread_mutex_t image_lock
; /* Lock for above list. */
399 struct ptx_device
*next
;
407 PTX_EVT_ASYNC_CLEANUP
418 struct ptx_event
*next
;
421 static pthread_mutex_t ptx_event_lock
;
422 static struct ptx_event
*ptx_events
;
424 static struct ptx_device
**ptx_devices
;
426 static inline struct nvptx_thread
*
429 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
433 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
436 struct ptx_stream
*null_stream
437 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
439 null_stream
->stream
= NULL
;
440 null_stream
->host_thread
= pthread_self ();
441 null_stream
->multithreaded
= true;
442 null_stream
->d
= (CUdeviceptr
) NULL
;
443 null_stream
->h
= NULL
;
444 if (!map_init (null_stream
))
447 ptx_dev
->null_stream
= null_stream
;
448 ptx_dev
->active_streams
= NULL
;
449 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
454 /* This is just a guess -- make space for as many async streams as the
455 current device is capable of concurrently executing. This can grow
456 later as necessary. No streams are created yet. */
457 ptx_dev
->async_streams
.arr
458 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
459 ptx_dev
->async_streams
.size
= concurrency
;
461 for (i
= 0; i
< concurrency
; i
++)
462 ptx_dev
->async_streams
.arr
[i
] = NULL
;
468 fini_streams_for_device (struct ptx_device
*ptx_dev
)
470 free (ptx_dev
->async_streams
.arr
);
473 while (ptx_dev
->active_streams
!= NULL
)
475 struct ptx_stream
*s
= ptx_dev
->active_streams
;
476 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
480 CUresult r
= CUDA_CALL_NOCHECK (cuStreamDestroy
, s
->stream
);
481 if (r
!= CUDA_SUCCESS
)
483 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r
));
489 ret
&= map_fini (ptx_dev
->null_stream
);
490 free (ptx_dev
->null_stream
);
494 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
495 thread THREAD (and also current device/context). If CREATE is true, create
496 the stream if it does not exist (or use EXISTING if it is non-NULL), and
497 associate the stream with the same thread argument. Returns stream to use
500 static struct ptx_stream
*
501 select_stream_for_async (int async
, pthread_t thread
, bool create
,
504 struct nvptx_thread
*nvthd
= nvptx_thread ();
505 /* Local copy of TLS variable. */
506 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
507 struct ptx_stream
*stream
= NULL
;
508 int orig_async
= async
;
510 /* The special value acc_async_noval (-1) maps (for now) to an
511 implicitly-created stream, which is then handled the same as any other
512 numbered async stream. Other options are available, e.g. using the null
513 stream for anonymous async operations, or choosing an idle stream from an
514 active set. But, stick with this for now. */
515 if (async
> acc_async_sync
)
519 pthread_mutex_lock (&ptx_dev
->stream_lock
);
521 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
522 null stream, and in fact better performance may be obtainable if it doesn't
523 (because the null stream enforces overly-strict synchronisation with
524 respect to other streams for legacy reasons, and that's probably not
525 needed with OpenACC). Maybe investigate later. */
526 if (async
== acc_async_sync
)
527 stream
= ptx_dev
->null_stream
;
528 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
529 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
530 stream
= ptx_dev
->async_streams
.arr
[async
];
531 else if (async
>= 0 && create
)
533 if (async
>= ptx_dev
->async_streams
.size
)
535 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
537 if (async
>= newsize
)
540 ptx_dev
->async_streams
.arr
541 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
542 newsize
* sizeof (struct ptx_stream
*));
544 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
545 ptx_dev
->async_streams
.arr
[i
] = NULL
;
547 ptx_dev
->async_streams
.size
= newsize
;
550 /* Create a new stream on-demand if there isn't one already, or if we're
551 setting a particular async value to an existing (externally-provided)
553 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
557 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
560 s
->stream
= existing
;
563 r
= CUDA_CALL_NOCHECK (cuStreamCreate
, &s
->stream
,
565 if (r
!= CUDA_SUCCESS
)
567 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
568 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
573 /* If CREATE is true, we're going to be queueing some work on this
574 stream. Associate it with the current host thread. */
575 s
->host_thread
= thread
;
576 s
->multithreaded
= false;
578 s
->d
= (CUdeviceptr
) NULL
;
582 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
583 GOMP_PLUGIN_fatal ("map_init fail");
586 s
->next
= ptx_dev
->active_streams
;
587 ptx_dev
->active_streams
= s
;
588 ptx_dev
->async_streams
.arr
[async
] = s
;
591 stream
= ptx_dev
->async_streams
.arr
[async
];
596 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
597 GOMP_PLUGIN_fatal ("bad async %d", async
);
602 assert (stream
!= NULL
);
604 /* If we're trying to use the same stream from different threads
605 simultaneously, set stream->multithreaded to true. This affects the
606 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
607 only wait for asynchronous launches from the same host thread they are
608 invoked on. If multiple threads use the same async value, we make note
609 of that here and fall back to testing/waiting for all threads in those
611 if (thread
!= stream
->host_thread
)
612 stream
->multithreaded
= true;
614 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
616 else if (stream
&& !stream
->multithreaded
617 && !pthread_equal (stream
->host_thread
, thread
))
618 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
623 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
624 should be locked on entry and remains locked on exit. */
631 if (instantiated_devices
!= 0)
635 pthread_mutex_init (&ptx_event_lock
, NULL
);
637 if (!init_cuda_lib ())
640 CUDA_CALL (cuInit
, 0);
642 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
643 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
648 /* Select the N'th PTX device for the current host thread. The device must
649 have been previously opened before calling this function. */
652 nvptx_attach_host_thread_to_device (int n
)
656 struct ptx_device
*ptx_dev
;
659 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
660 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
662 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
666 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
672 ptx_dev
= ptx_devices
[n
];
675 GOMP_PLUGIN_error ("device %d not found", n
);
679 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
681 /* We don't necessarily have a current context (e.g. if it has been
682 destroyed. Pop it if we do though. */
684 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
686 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
691 static struct ptx_device
*
692 nvptx_open_device (int n
)
694 struct ptx_device
*ptx_dev
;
695 CUdevice dev
, ctx_dev
;
697 int async_engines
, pi
;
699 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
701 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
705 ptx_dev
->ctx_shared
= false;
707 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
708 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
710 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
714 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
716 /* The current host thread has an active context for a different device.
719 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
722 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
725 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
727 ptx_dev
->ctx_shared
= true;
729 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
730 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
731 ptx_dev
->overlap
= pi
;
733 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
734 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
737 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
738 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
739 ptx_dev
->concur
= pi
;
741 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
742 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
745 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
746 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
749 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
750 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
751 ptx_dev
->clock_khz
= pi
;
753 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
754 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
755 ptx_dev
->num_sms
= pi
;
757 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
758 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
759 ptx_dev
->regs_per_block
= pi
;
761 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
762 in CUDA 6.0 and newer. */
763 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
764 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
766 /* Fallback: use limit of registers per block, which is usually equal. */
767 if (r
== CUDA_ERROR_INVALID_VALUE
)
768 pi
= ptx_dev
->regs_per_block
;
769 else if (r
!= CUDA_SUCCESS
)
771 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
774 ptx_dev
->regs_per_sm
= pi
;
776 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
777 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
780 GOMP_PLUGIN_error ("Only warp size 32 is supported");
783 ptx_dev
->warp_size
= pi
;
785 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
786 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
787 ptx_dev
->max_threads_per_block
= pi
;
789 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
790 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
791 ptx_dev
->max_threads_per_multiprocessor
= pi
;
793 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &async_engines
,
794 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
795 if (r
!= CUDA_SUCCESS
)
798 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
799 ptx_dev
->default_dims
[i
] = 0;
801 ptx_dev
->images
= NULL
;
802 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
804 if (!init_streams_for_device (ptx_dev
, async_engines
))
811 nvptx_close_device (struct ptx_device
*ptx_dev
)
816 if (!fini_streams_for_device (ptx_dev
))
819 pthread_mutex_destroy (&ptx_dev
->image_lock
);
821 if (!ptx_dev
->ctx_shared
)
822 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
829 nvptx_get_num_devices (void)
833 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
835 if (sizeof (void *) != 8)
837 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
838 " only 64-bit configurations are supported\n");
842 /* This function will be called before the plugin has been initialized in
843 order to enumerate available devices, but CUDA API routines can't be used
844 until cuInit has been called. Just call it now (but don't yet do any
845 further initialization). */
846 if (instantiated_devices
== 0)
848 if (!init_cuda_lib ())
850 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
851 /* This is not an error: e.g. we may have CUDA libraries installed but
852 no devices available. */
853 if (r
!= CUDA_SUCCESS
)
855 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
861 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
866 notify_var (const char *var_name
, const char *env_var
)
869 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
871 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
875 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
877 const char *var_name
= "GOMP_NVPTX_JIT";
878 const char *env_var
= secure_getenv (var_name
);
879 notify_var (var_name
, env_var
);
884 const char *c
= env_var
;
890 if (c
[0] == '-' && c
[1] == 'O'
891 && '0' <= c
[2] && c
[2] <= '4'
892 && (c
[3] == '\0' || c
[3] == ' '))
894 *gomp_nvptx_o
= c
[2] - '0';
899 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
905 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
908 CUjit_option opts
[7];
913 CUlinkState linkstate
;
916 size_t linkoutsize
__attribute__ ((unused
));
918 opts
[0] = CU_JIT_WALL_TIME
;
919 optvals
[0] = &elapsed
;
921 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
922 optvals
[1] = &ilog
[0];
924 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
925 optvals
[2] = (void *) sizeof ilog
;
927 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
928 optvals
[3] = &elog
[0];
930 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
931 optvals
[4] = (void *) sizeof elog
;
933 opts
[5] = CU_JIT_LOG_VERBOSE
;
934 optvals
[5] = (void *) 1;
936 static intptr_t gomp_nvptx_o
= -1;
938 static bool init_done
= false;
941 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
946 if (gomp_nvptx_o
!= -1)
948 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
949 optvals
[nopts
] = (void *) gomp_nvptx_o
;
953 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
954 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
956 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
958 for (; num_objs
--; ptx_objs
++)
960 /* cuLinkAddData's 'data' argument erroneously omits the const
962 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
963 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
964 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
965 (char *) ptx_objs
->code
, ptx_objs
->size
,
968 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
969 (char *) ptx_objs
->code
, ptx_objs
->size
,
971 if (r
!= CUDA_SUCCESS
)
973 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
974 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
980 GOMP_PLUGIN_debug (0, "Linking\n");
981 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
983 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
984 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
986 if (r
!= CUDA_SUCCESS
)
988 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
992 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
993 CUDA_CALL (cuLinkDestroy
, linkstate
);
998 event_gc (bool memmap_lockable
)
1000 struct ptx_event
*ptx_event
= ptx_events
;
1001 struct ptx_event
*async_cleanups
= NULL
;
1002 struct nvptx_thread
*nvthd
= nvptx_thread ();
1004 pthread_mutex_lock (&ptx_event_lock
);
1006 while (ptx_event
!= NULL
)
1009 struct ptx_event
*e
= ptx_event
;
1011 ptx_event
= ptx_event
->next
;
1013 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
1016 r
= CUDA_CALL_NOCHECK (cuEventQuery
, *e
->evt
);
1017 if (r
== CUDA_SUCCESS
)
1019 bool append_async
= false;
1034 case PTX_EVT_ASYNC_CLEANUP
:
1036 /* The function gomp_plugin_async_unmap_vars needs to claim the
1037 memory-map splay tree lock for the current device, so we
1038 can't call it when one of our callers has already claimed
1039 the lock. In that case, just delay the GC for this event
1041 if (!memmap_lockable
)
1044 append_async
= true;
1049 CUDA_CALL_NOCHECK (cuEventDestroy
, *te
);
1052 /* Unlink 'e' from ptx_events list. */
1053 if (ptx_events
== e
)
1054 ptx_events
= ptx_events
->next
;
1057 struct ptx_event
*e_
= ptx_events
;
1058 while (e_
->next
!= e
)
1060 e_
->next
= e_
->next
->next
;
1065 e
->next
= async_cleanups
;
1073 pthread_mutex_unlock (&ptx_event_lock
);
1075 /* We have to do these here, after ptx_event_lock is released. */
1076 while (async_cleanups
)
1078 struct ptx_event
*e
= async_cleanups
;
1079 async_cleanups
= async_cleanups
->next
;
1081 GOMP_PLUGIN_async_unmap_vars (e
->addr
, e
->val
);
1087 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
, int val
)
1089 struct ptx_event
*ptx_event
;
1090 struct nvptx_thread
*nvthd
= nvptx_thread ();
1092 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
1093 || type
== PTX_EVT_ASYNC_CLEANUP
);
1095 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
1096 ptx_event
->type
= type
;
1098 ptx_event
->addr
= h
;
1099 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
1100 ptx_event
->val
= val
;
1102 pthread_mutex_lock (&ptx_event_lock
);
1104 ptx_event
->next
= ptx_events
;
1105 ptx_events
= ptx_event
;
1107 pthread_mutex_unlock (&ptx_event_lock
);
1111 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
1112 int async
, unsigned *dims
, void *targ_mem_desc
)
1114 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
1115 CUfunction function
;
1118 struct ptx_stream
*dev_str
;
1121 struct nvptx_thread
*nvthd
= nvptx_thread ();
1122 int warp_size
= nvthd
->ptx_dev
->warp_size
;
1123 const char *maybe_abort_msg
= "(perhaps abort was called)";
1125 function
= targ_fn
->fn
;
1127 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1128 assert (dev_str
== nvthd
->current_stream
);
1130 /* Initialize the launch dimensions. Typically this is constant,
1131 provided by the device compiler, but we must permit runtime
1134 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1136 if (targ_fn
->launch
->dim
[i
])
1137 dims
[i
] = targ_fn
->launch
->dim
[i
];
1144 pthread_mutex_lock (&ptx_dev_lock
);
1146 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
1147 if (!gomp_openacc_dims
[0])
1149 /* See if the user provided GOMP_OPENACC_DIM environment
1150 variable to specify runtime defaults. */
1151 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1152 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
1155 if (!nvthd
->ptx_dev
->default_dims
[0])
1157 int default_dims
[GOMP_DIM_MAX
];
1158 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1159 default_dims
[i
] = gomp_openacc_dims
[i
];
1161 int gang
, worker
, vector
;
1163 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
1164 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
1165 int dev_size
= nvthd
->ptx_dev
->num_sms
;
1166 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1167 " dev_size=%d, cpu_size=%d\n",
1168 warp_size
, block_size
, dev_size
, cpu_size
);
1170 gang
= (cpu_size
/ block_size
) * dev_size
;
1171 worker
= block_size
/ warp_size
;
1175 /* There is no upper bound on the gang size. The best size
1176 matches the hardware configuration. Logical gangs are
1177 scheduled onto physical hardware. To maximize usage, we
1178 should guess a large number. */
1179 if (default_dims
[GOMP_DIM_GANG
] < 1)
1180 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
1181 /* The worker size must not exceed the hardware. */
1182 if (default_dims
[GOMP_DIM_WORKER
] < 1
1183 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
1184 default_dims
[GOMP_DIM_WORKER
] = worker
;
1185 /* The vector size must exactly match the hardware. */
1186 if (default_dims
[GOMP_DIM_VECTOR
] < 1
1187 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
1188 default_dims
[GOMP_DIM_VECTOR
] = vector
;
1190 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1191 default_dims
[GOMP_DIM_GANG
],
1192 default_dims
[GOMP_DIM_WORKER
],
1193 default_dims
[GOMP_DIM_VECTOR
]);
1195 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1196 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
1198 pthread_mutex_unlock (&ptx_dev_lock
);
1201 bool default_dim_p
[GOMP_DIM_MAX
];
1202 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1204 default_dim_p
[i
] = !dims
[i
];
1205 if (default_dim_p
[i
])
1206 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
1209 if (default_dim_p
[GOMP_DIM_VECTOR
])
1210 dims
[GOMP_DIM_VECTOR
]
1211 = MIN (dims
[GOMP_DIM_VECTOR
],
1212 (targ_fn
->max_threads_per_block
/ warp_size
* warp_size
));
1214 if (default_dim_p
[GOMP_DIM_WORKER
])
1215 dims
[GOMP_DIM_WORKER
]
1216 = MIN (dims
[GOMP_DIM_WORKER
],
1217 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
1221 /* Check if the accelerator has sufficient hardware resources to
1222 launch the offloaded kernel. */
1223 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
1224 > targ_fn
->max_threads_per_block
)
1227 = targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
];
1228 GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
1229 " launch '%s' with num_workers = %d; recompile the"
1230 " program with 'num_workers = %d' on that offloaded"
1231 " region or '-fopenacc-dim=:%d'",
1232 targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
1233 suggest_workers
, suggest_workers
);
1236 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1237 the host and the device. HP is a host pointer to the new chunk, and DP is
1238 the corresponding device pointer. */
1239 map_push (dev_str
, mapnum
* sizeof (void *), &hp
, &dp
);
1241 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1243 /* Copy the array of arguments to the mapped page. */
1244 for (i
= 0; i
< mapnum
; i
++)
1245 ((void **) hp
)[i
] = devaddrs
[i
];
1247 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1248 fact have the same value on a unified-memory system). */
1249 CUDA_CALL_ASSERT (cuMemcpy
, (CUdeviceptr
) dp
, (CUdeviceptr
) hp
,
1250 mapnum
* sizeof (void *));
1251 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1252 " gangs=%u, workers=%u, vectors=%u\n",
1253 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
1254 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
1258 // num_gangs nctaid.x
1259 // num_workers ntid.y
1260 // vector length ntid.x
1263 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
1264 dims
[GOMP_DIM_GANG
], 1, 1,
1265 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
1266 0, dev_str
->stream
, kargs
, 0);
1268 #ifndef DISABLE_ASYNC
1269 if (async
< acc_async_noval
)
1271 r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, dev_str
->stream
);
1272 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1273 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1275 else if (r
!= CUDA_SUCCESS
)
1276 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1282 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1284 r
= CUDA_CALL_NOCHECK (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1285 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1286 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
1288 else if (r
!= CUDA_SUCCESS
)
1289 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1293 CUDA_CALL_ASSERT (cuEventRecord
, *e
, dev_str
->stream
);
1295 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
, 0);
1298 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
1299 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1300 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1302 else if (r
!= CUDA_SUCCESS
)
1303 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1306 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1307 targ_fn
->launch
->fn
);
1309 #ifndef DISABLE_ASYNC
1310 if (async
< acc_async_noval
)
1315 void * openacc_get_current_cuda_context (void);
1318 nvptx_alloc (size_t s
)
1322 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1327 nvptx_free (void *p
)
1332 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1333 if ((CUdeviceptr
) p
!= pb
)
1335 GOMP_PLUGIN_error ("invalid device address");
1339 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1345 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1349 struct nvptx_thread
*nvthd
= nvptx_thread ();
1355 GOMP_PLUGIN_error ("invalid device address");
1359 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1363 GOMP_PLUGIN_error ("invalid device address");
1368 GOMP_PLUGIN_error ("invalid host address");
1373 GOMP_PLUGIN_error ("invalid host or device address");
1376 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1378 GOMP_PLUGIN_error ("invalid size");
1382 #ifndef DISABLE_ASYNC
1383 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1385 CUevent
*e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1386 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1388 CUDA_CALL (cuMemcpyHtoDAsync
,
1389 (CUdeviceptr
) d
, h
, s
, nvthd
->current_stream
->stream
);
1390 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1391 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1395 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) d
, h
, s
);
/* Copy S bytes from device buffer D to host buffer H — the mirror of
   nvptx_host2dev.  Same validation of the device allocation bounds,
   same async/sync split on the current stream.
   NOTE(review): the extraction dropped several original lines
   (declarations of pb/ps and the conditions guarding the error calls);
   annotated byte-identical, not repaired.  */
1401 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1405 struct nvptx_thread
*nvthd
= nvptx_thread ();
/* NOTE(review): guarding condition dropped by extraction.  */
1411 GOMP_PLUGIN_error ("invalid device address");
/* Look up the driver allocation containing D for bounds checking.  */
1415 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1419 GOMP_PLUGIN_error ("invalid device address");
1424 GOMP_PLUGIN_error ("invalid host address");
1429 GOMP_PLUGIN_error ("invalid host or device address");
/* Reject copies overrunning the device allocation.  */
1432 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1434 GOMP_PLUGIN_error ("invalid size");
1438 #ifndef DISABLE_ASYNC
/* Async path on this thread's non-null stream.  */
1439 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
/* Event is heap-allocated; ownership passes to event_add.  */
1441 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1442 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1444 CUDA_CALL (cuMemcpyDtoHAsync
,
1445 h
, (CUdeviceptr
) d
, s
, nvthd
->current_stream
->stream
);
1446 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1447 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
/* Synchronous fallback.  */
1451 CUDA_CALL (cuMemcpyDtoH
, h
, (CUdeviceptr
) d
, s
);
1457 nvptx_set_async (int async
)
1459 struct nvptx_thread
*nvthd
= nvptx_thread ();
1460 nvthd
->current_stream
1461 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1465 nvptx_async_test (int async
)
1468 struct ptx_stream
*s
;
1470 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1473 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1475 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1476 if (r
== CUDA_SUCCESS
)
1478 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1479 whether all work has completed on this stream, and if so omits the call
1480 to the wait hook. If that happens, event_gc might not get called
1481 (which prevents variables from getting unmapped and their associated
1482 device storage freed), so call it here. */
1486 else if (r
== CUDA_ERROR_NOT_READY
)
1489 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
/* Test whether ALL streams belonging to this thread (plus any
   multithreaded streams) are idle.  Walks the device's active-stream
   list under stream_lock; the first busy stream causes an early exit.
   NOTE(review): the extraction dropped the return type and the return
   statements (presumably return 0 on a busy stream, return 1 at the
   end, likely with an event_gc call — confirm against upstream);
   annotated byte-identical, not repaired.  */
1495 nvptx_async_test_all (void)
1497 struct ptx_stream
*s
;
1498 pthread_t self
= pthread_self ();
1499 struct nvptx_thread
*nvthd
= nvptx_thread ();
/* The active-stream list is shared across threads; guard the walk.  */
1501 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1503 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
/* Only consider streams owned by this thread, or shared ones.  */
1505 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1506 && CUDA_CALL_NOCHECK (cuStreamQuery
,
1507 s
->stream
) == CUDA_ERROR_NOT_READY
)
/* A stream is still busy: release the lock before bailing out.  */
1509 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
/* All relevant streams idle.  */
1514 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1522 nvptx_wait (int async
)
1524 struct ptx_stream
*s
;
1526 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1528 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1530 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
/* Make stream ASYNC2 wait (on-device, without blocking the host) for
   all work currently queued on stream ASYNC1, by recording an event on
   ASYNC1 and inserting a cuStreamWaitEvent on ASYNC2.
   NOTE(review): the extraction dropped the conditions guarding the two
   fatal calls (presumably !s1, and s1 == s2) and the CUevent *e
   declaration; annotated byte-identical, not repaired.  */
1536 nvptx_wait_async (int async1
, int async2
)
1539 struct ptx_stream
*s1
, *s2
;
1540 pthread_t self
= pthread_self ();
1542 /* The stream that is waiting (rather than being waited for) doesn't
1543 necessarily have to exist already. */
1544 s2
= select_stream_for_async (async2
, self
, true, NULL
);
/* The waited-for stream must already exist (CREATE is false).  */
1546 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1548 GOMP_PLUGIN_fatal ("invalid async 1\n");
/* Waiting on oneself is meaningless.  */
1551 GOMP_PLUGIN_fatal ("identical parameters");
/* Event is heap-allocated; ownership passes to event_add below.  */
1553 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1555 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
/* Mark the current tail of s1's work.  */
1559 CUDA_CALL_ASSERT (cuEventRecord
, *e
, s1
->stream
);
1561 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
/* s2 will not run past this point until the event on s1 fires.  */
1563 CUDA_CALL_ASSERT (cuStreamWaitEvent
, s2
->stream
, *e
, 0);
/* Block until every stream owned by this thread (and every
   multithreaded stream) is idle.  Streams already idle (cuStreamQuery
   == CUDA_SUCCESS) are skipped; busy ones get a cuStreamSynchronize.
   NOTE(review): the extraction dropped the CUresult r declaration, the
   skip branch body after the CUDA_SUCCESS check (presumably continue;)
   and (per upstream) a trailing event_gc call; annotated
   byte-identical, not repaired.  */
1567 nvptx_wait_all (void)
1570 struct ptx_stream
*s
;
1571 pthread_t self
= pthread_self ();
1572 struct nvptx_thread
*nvthd
= nvptx_thread ();
/* Protect the shared active-stream list during the walk.  */
1574 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1576 /* Wait for active streams initiated by this thread (or by multiple threads)
1578 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1580 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
/* Query first: an already-idle stream needs no synchronize.  */
1582 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1583 if (r
== CUDA_SUCCESS
)
/* Any status other than "done" or "still running" is fatal.  */
1585 else if (r
!= CUDA_ERROR_NOT_READY
)
1586 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1588 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1592 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
/* Make stream ASYNC wait (on-device) for all other active streams of
   this device: for each other stream, record an event and insert a
   cuStreamWaitEvent on the waiting stream.
   NOTE(review): the extraction dropped the lvalue of the assignment at
   original line 1608 (waiting_stream =), the early return when the
   waiting stream is the null stream, the CUevent *e declaration, and
   the tail of the comment at original line 1610; annotated
   byte-identical, not repaired.  */
1598 nvptx_wait_all_async (int async
)
1600 struct ptx_stream
*waiting_stream
, *other_stream
;
1602 struct nvptx_thread
*nvthd
= nvptx_thread ();
1603 pthread_t self
= pthread_self ();
1605 /* The stream doing the waiting. This could be the first mention of the
1606 stream, so create it if necessary. */
1608 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1610 /* Launches on the null stream already block on other streams in the
/* Null stream has implicit synchronization; nothing to do.  */
1612 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
/* Walk the shared active-stream list under the lock.  */
1617 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1619 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1620 other_stream
!= NULL
;
1621 other_stream
= other_stream
->next
)
/* Skip streams that belong exclusively to other host threads.  */
1623 if (!other_stream
->multithreaded
1624 && !pthread_equal (other_stream
->host_thread
, self
))
/* One heap-allocated event per waited-for stream; event_add owns it.  */
1627 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1629 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1631 /* Record an event on the waited-for stream. */
1632 CUDA_CALL_ASSERT (cuEventRecord
, *e
, other_stream
->stream
);
1634 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
/* The waiting stream stalls until that event fires.  */
1636 CUDA_CALL_ASSERT (cuStreamWaitEvent
, waiting_stream
->stream
, *e
, 0);
1639 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1643 nvptx_get_current_cuda_device (void)
1645 struct nvptx_thread
*nvthd
= nvptx_thread ();
1647 if (!nvthd
|| !nvthd
->ptx_dev
)
1650 return &nvthd
->ptx_dev
->dev
;
1654 nvptx_get_current_cuda_context (void)
1656 struct nvptx_thread
*nvthd
= nvptx_thread ();
1658 if (!nvthd
|| !nvthd
->ptx_dev
)
1661 return nvthd
->ptx_dev
->ctx
;
1665 nvptx_get_cuda_stream (int async
)
1667 struct ptx_stream
*s
;
1668 struct nvptx_thread
*nvthd
= nvptx_thread ();
1670 if (!nvthd
|| !nvthd
->ptx_dev
)
1673 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1675 return s
? s
->stream
: NULL
;
/* Install a user-supplied raw CUstream as async queue ASYNC.  Any
   previously-installed stream for that async value is unlinked from the
   device's active-stream list, destroyed and its host-side map freed,
   then the new stream is registered via select_stream_for_async.
   NOTE(review): the extraction dropped the guard before the "bad async"
   fatal (presumably async < 0), the NULL/ownership checks around
   oldstream, several braces, and the return statement; annotated
   byte-identical, not repaired.  */
1679 nvptx_set_cuda_stream (int async
, void *stream
)
1681 struct ptx_stream
*oldstream
;
1682 pthread_t self
= pthread_self ();
1683 struct nvptx_thread
*nvthd
= nvptx_thread ();
/* NOTE(review): guarding condition dropped by extraction.  */
1686 GOMP_PLUGIN_fatal ("bad async %d", async
);
/* List surgery below must be protected from other host threads.  */
1688 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1690 /* We have a list of active streams and an array mapping async values to
1691 entries of that list. We need to take "ownership" of the passed-in stream,
1692 and add it to our list, removing the previous entry also (if there was one)
1693 in order to prevent resource leaks. Note the potential for surprise
1694 here: maybe we should keep track of passed-in streams and leave it up to
1695 the user to tidy those up, but that doesn't work for stream handles
1696 returned from acc_get_cuda_stream above... */
1698 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
/* Unlink oldstream: either it is the list head...  */
1702 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1703 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
/* ...or we must find its predecessor and splice it out.  */
1706 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1707 while (s
->next
!= oldstream
)
1709 s
->next
= s
->next
->next
;
/* Destroy the driver stream and the host-side mapping buffer.  */
1712 CUDA_CALL_ASSERT (cuStreamDestroy
, oldstream
->stream
);
1714 if (!map_fini (oldstream
))
1715 GOMP_PLUGIN_fatal ("error when freeing host memory");
1720 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
/* Register the user's stream under ASYNC (CREATE is true; the existing
   stream argument is adopted rather than a new one created).  */
1722 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1727 /* Plugin entry points. */
1730 GOMP_OFFLOAD_get_name (void)
1736 GOMP_OFFLOAD_get_caps (void)
1738 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1742 GOMP_OFFLOAD_get_type (void)
1744 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
1748 GOMP_OFFLOAD_get_num_devices (void)
1750 return nvptx_get_num_devices ();
/* Initialize device N: perform one-time driver init (nvptx_init) if
   needed, open the device, and record it in the ptx_devices table under
   ptx_dev_lock.  NOTE(review): the extraction dropped the return type,
   the early-return body after the "already initialized" check, the
   failure check on nvptx_open_device, and the return statements;
   annotated byte-identical, not repaired.  */
1754 GOMP_OFFLOAD_init_device (int n
)
1756 struct ptx_device
*dev
;
/* Serialize device-table access against concurrent init/fini.  */
1758 pthread_mutex_lock (&ptx_dev_lock
);
/* Bail out if driver init failed or device N is already open.  */
1760 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1762 pthread_mutex_unlock (&ptx_dev_lock
);
1766 dev
= nvptx_open_device (n
);
/* Publish the opened device and bump the live-device count.  */
1769 ptx_devices
[n
] = dev
;
1770 instantiated_devices
++;
1773 pthread_mutex_unlock (&ptx_dev_lock
);
/* Finalize device N: attach the host thread, close the device, clear
   its table slot and decrement the live-device count — all under
   ptx_dev_lock.  NOTE(review): the extraction dropped the return type,
   the failure-return body, and the final return; annotated
   byte-identical, not repaired.  */
1779 GOMP_OFFLOAD_fini_device (int n
)
1781 pthread_mutex_lock (&ptx_dev_lock
);
/* Nothing to do if the device was never opened.  */
1783 if (ptx_devices
[n
] != NULL
)
1785 if (!nvptx_attach_host_thread_to_device (n
)
1786 || !nvptx_close_device (ptx_devices
[n
]))
/* Failure path: drop the lock before reporting/returning.  */
1788 pthread_mutex_unlock (&ptx_dev_lock
);
1791 ptx_devices
[n
] = NULL
;
1792 instantiated_devices
--;
1795 pthread_mutex_unlock (&ptx_dev_lock
);
1799 /* Return the libgomp version number we're compatible with. There is
1800 no requirement for cross-version compatibility. */
1803 GOMP_OFFLOAD_version (void)
1805 return GOMP_VERSION
;
1808 /* Initialize __nvptx_clocktick, if present in MODULE. */
/* Writes the device-global __nvptx_clocktick (seconds per SM clock
   tick, derived from the device clock rate in kHz) into MODULE if the
   symbol exists; silently does nothing if the module doesn't define it.
   NOTE(review): the extraction dropped the return type, the CUdeviceptr
   dptr declaration, and the early-return body after the NOT_FOUND
   check; annotated byte-identical, not repaired.  */
1811 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
/* Look up the symbol; NOT_FOUND is expected for modules that don't
   use the clock.  */
1814 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1815 module
, "__nvptx_clocktick");
1816 if (r
== CUDA_ERROR_NOT_FOUND
)
1818 if (r
!= CUDA_SUCCESS
)
1819 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
/* clock_khz is in kHz, so 1e-3/khz gives seconds per tick.  */
1820 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1821 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1822 sizeof (__nvptx_clocktick
));
1823 if (r
!= CUDA_SUCCESS
)
1824 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1827 /* Load the (partial) program described by TARGET_DATA to device
1828 number ORD. Allocate and return TARGET_TABLE. */
/* Returns the number of table entries (functions + variables), or -1
   on error.  Links the embedded PTX into a CUmodule, registers the
   image on the device, then fills TARGET_TABLE with one addr_pair per
   kernel (pointing at a targ_fn_descriptor) followed by one per global
   variable (device address + size).
   NOTE(review): the extraction dropped the return type, the CUmodule
   declaration, several error-return bodies, the second size factor of
   the targ_fns allocation, nregs/mthrs/var/bytes declarations, and the
   kernel-name argument of cuModuleGetFunction; annotated
   byte-identical, not repaired.  */
1831 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1832 struct addr_pair
**target_table
)
1835 const char *const *var_names
;
1836 const struct targ_fn_launch
*fn_descs
;
1837 unsigned int fn_entries
, var_entries
, i
, j
;
1838 struct targ_fn_descriptor
*targ_fns
;
1839 struct addr_pair
*targ_tbl
;
1840 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1841 struct ptx_image_data
*new_image
;
1842 struct ptx_device
*dev
;
/* Refuse images built by a newer mkoffload than this plugin knows.  */
1844 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1846 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1847 " (expected %u, received %u)",
1848 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
/* JIT-link the PTX objects of this image into a CUmodule.  */
1852 if (!nvptx_attach_host_thread_to_device (ord
)
1853 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1856 dev
= ptx_devices
[ord
];
1858 /* The mkoffload utility emits a struct of pointers/integers at the
1859 start of each offload image. The array of kernel names and the
1860 functions addresses form a one-to-one correspondence. */
1862 var_entries
= img_header
->var_num
;
1863 var_names
= img_header
->var_names
;
1864 fn_entries
= img_header
->fn_num
;
1865 fn_descs
= img_header
->fn_descs
;
/* One table slot per kernel plus one per variable.  */
1867 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1868 * (fn_entries
+ var_entries
));
1869 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1872 *target_table
= targ_tbl
;
/* Register this image on the device so unload_image can find it.  */
1874 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1875 new_image
->target_data
= target_data
;
1876 new_image
->module
= module
;
1877 new_image
->fns
= targ_fns
;
1879 pthread_mutex_lock (&dev
->image_lock
);
1880 new_image
->next
= dev
->images
;
1881 dev
->images
= new_image
;
1882 pthread_mutex_unlock (&dev
->image_lock
);
/* Resolve each kernel and cache its launch-relevant attributes.  */
1884 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1886 CUfunction function
;
1889 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1891 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1892 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1893 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1894 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1896 targ_fns
->fn
= function
;
1897 targ_fns
->launch
= &fn_descs
[i
];
1898 targ_fns
->regs_per_thread
= nregs
;
1899 targ_fns
->max_threads_per_block
= mthrs
;
/* Function table entries carry the descriptor address, not a range.  */
1901 targ_tbl
->start
= (uintptr_t) targ_fns
;
1902 targ_tbl
->end
= targ_tbl
->start
+ 1;
/* Resolve each global variable to its device address and size.  */
1905 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1910 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1911 &var
, &bytes
, module
, var_names
[j
]);
1913 targ_tbl
->start
= (uintptr_t) var
;
1914 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1917 nvptx_set_clocktick (module
, dev
);
1919 return fn_entries
+ var_entries
;
1922 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1923 function descriptors allocated by G_O_load_image. */
/* Finds the image registered for TARGET_DATA on device ORD, unlinks it
   from the device's image list under image_lock, and unloads its
   CUmodule.  NOTE(review): the extraction dropped the return type, the
   "ret" local and return statements, the free() of the image/fns
   storage, and the loop-break after a match; annotated byte-identical,
   not repaired.  */
1926 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1928 struct ptx_image_data
*image
, **prev_p
;
1929 struct ptx_device
*dev
= ptx_devices
[ord
];
/* Version check mirrors load_image.  */
1931 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1933 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1934 " (expected %u, received %u)",
1935 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
/* Walk the image list with a pointer-to-link so unlinking is O(1).  */
1940 pthread_mutex_lock (&dev
->image_lock
);
1941 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1942 if (image
->target_data
== target_data
)
1944 *prev_p
= image
->next
;
1945 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
1951 pthread_mutex_unlock (&dev
->image_lock
);
1956 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
1958 if (!nvptx_attach_host_thread_to_device (ord
))
1960 return nvptx_alloc (size
);
1964 GOMP_OFFLOAD_free (int ord
, void *ptr
)
1966 return (nvptx_attach_host_thread_to_device (ord
)
1967 && nvptx_free (ptr
));
1971 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
1973 return (nvptx_attach_host_thread_to_device (ord
)
1974 && nvptx_dev2host (dst
, src
, n
));
1978 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
1980 return (nvptx_attach_host_thread_to_device (ord
)
1981 && nvptx_host2dev (dst
, src
, n
));
1985 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
1987 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1988 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
,
1989 ptx_dev
->null_stream
->stream
);
1993 void (*device_run
) (int n
, void *fn_ptr
, void *vars
) = NULL
;
1996 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *), size_t mapnum
,
1997 void **hostaddrs
, void **devaddrs
,
1998 int async
, unsigned *dims
, void *targ_mem_desc
)
2000 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, async
, dims
, targ_mem_desc
);
2004 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
, int async
)
2006 struct nvptx_thread
*nvthd
= nvptx_thread ();
2007 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
2009 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
2010 CUDA_CALL_ASSERT (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
2011 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
, async
);
2015 GOMP_OFFLOAD_openacc_async_test (int async
)
2017 return nvptx_async_test (async
);
2021 GOMP_OFFLOAD_openacc_async_test_all (void)
2023 return nvptx_async_test_all ();
2027 GOMP_OFFLOAD_openacc_async_wait (int async
)
2033 GOMP_OFFLOAD_openacc_async_wait_async (int async1
, int async2
)
2035 nvptx_wait_async (async1
, async2
);
2039 GOMP_OFFLOAD_openacc_async_wait_all (void)
2045 GOMP_OFFLOAD_openacc_async_wait_all_async (int async
)
2047 nvptx_wait_all_async (async
);
2051 GOMP_OFFLOAD_openacc_async_set_async (int async
)
2053 nvptx_set_async (async
);
2057 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
2059 struct ptx_device
*ptx_dev
;
2060 struct nvptx_thread
*nvthd
2061 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
2064 ptx_dev
= ptx_devices
[ord
];
2068 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
2070 assert (ptx_dev
->ctx
);
2073 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
2075 nvthd
->current_stream
= ptx_dev
->null_stream
;
2076 nvthd
->ptx_dev
= ptx_dev
;
2078 return (void *) nvthd
;
2082 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)
2088 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2090 return nvptx_get_current_cuda_device ();
2094 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2096 return nvptx_get_current_cuda_context ();
2099 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
2102 GOMP_OFFLOAD_openacc_cuda_get_stream (int async
)
2104 return nvptx_get_cuda_stream (async
);
2107 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
2110 GOMP_OFFLOAD_openacc_cuda_set_stream (int async
, void *stream
)
2112 return nvptx_set_cuda_stream (async
, stream
);
2115 /* Adjust launch dimensions: pick good values for number of blocks and warps
2116 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2120 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
2121 struct ptx_device
*ptx_dev
,
2122 int *teams_p
, int *threads_p
)
2124 int max_warps_block
= fn
->max_threads_per_block
/ 32;
2125 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2126 and libgcc, which matches documented limit of all GPUs as of 2015. */
2127 if (max_warps_block
> 32)
2128 max_warps_block
= 32;
2129 if (*threads_p
<= 0)
2131 if (*threads_p
> max_warps_block
)
2132 *threads_p
= max_warps_block
;
2134 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
2135 /* This is an estimate of how many blocks the device can host simultaneously.
2136 Actual limit, which may be lower, can be queried with "occupancy control"
2137 driver interface (since CUDA 6.0). */
2138 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
2139 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
2140 *teams_p
= max_blocks
;
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for
   OpenMP target regions.  NOTE(review): the 128 KiB constant is
   reconstructed from upstream libgomp — the extraction dropped the
   function body.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
2152 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2155 nvptx_stacks_alloc (size_t size
, int num
)
2158 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &stacks
, size
* num
);
2159 if (r
!= CUDA_SUCCESS
)
2160 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
2161 return (void *) stacks
;
2164 /* Release storage previously allocated by nvptx_stacks_alloc. */
2167 nvptx_stacks_free (void *p
, int num
)
2169 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, (CUdeviceptr
) p
);
2170 if (r
!= CUDA_SUCCESS
)
2171 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
/* OpenMP entry point: synchronously run kernel TGT_FN with argument
   block TGT_VARS on device ORD.  ARGS is the GOMP_TARGET_ARG_* encoded
   list carrying num_teams/thread_limit; launch bounds are then adjusted
   against device limits, per-warp soft stacks are allocated, the kernel
   is launched on the null stream and the context is synchronized.
   NOTE(review): the extraction dropped the return type, the CUresult r
   declaration, the loop head over *args, the assignments of teams/
   threads from val, the kernel-args array head, the trailing
   cuLaunchKernel arguments, and the maybe_abort_msg argument of the
   LAUNCH_FAILED fatal; annotated byte-identical, not repaired.  */
2175 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
2177 CUfunction function
= ((struct targ_fn_descriptor
*) tgt_fn
)->fn
;
2179 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2180 const char *maybe_abort_msg
= "(perhaps abort was called)";
2181 int teams
= 0, threads
= 0;
2184 GOMP_PLUGIN_fatal ("No target arguments provided");
/* Decode one GOMP_TARGET_ARG entry; a SUBSEQUENT_PARAM carries its
   value in the next args slot, otherwise it is packed in the id.  */
2187 intptr_t id
= (intptr_t) *args
++, val
;
2188 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
2189 val
= (intptr_t) *args
++;
2191 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
/* Skip entries that target a different device class.  */
2192 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
/* Clamp to int range before use as a launch dimension.  */
2194 val
= val
> INT_MAX
? INT_MAX
: val
;
2195 id
&= GOMP_TARGET_ARG_ID_MASK
;
2196 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
2198 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
/* Fit requested teams/threads to this kernel and device.  */
2201 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
/* One soft-stack per warp: teams * threads warps in total.  */
2203 size_t stack_size
= nvptx_stacks_size ();
2204 void *stacks
= nvptx_stacks_alloc (stack_size
, teams
* threads
);
2205 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
2206 size_t fn_args_size
= sizeof fn_args
;
/* Pass the argument block via the buffer-pointer/size launch params.  */
2208 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
2209 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
/* Grid = teams x 1 x 1, block = 32 (warp) x threads x 1.  */
2212 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
2213 32, threads
, 1, 0, ptx_dev
->null_stream
->stream
,
2215 if (r
!= CUDA_SUCCESS
)
2216 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
/* Synchronize to surface in-kernel failures before freeing stacks.  */
2218 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2219 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2220 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2222 else if (r
!= CUDA_SUCCESS
)
2223 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2224 nvptx_stacks_free (stacks
, teams
* threads
);
/* Asynchronous OpenMP target execution is not supported by this
   plugin; libgomp falls back accordingly.  Always fatal if reached.
   NOTE(review): the final parameter name/type was dropped by the
   extraction; reconstructed as the async completion token per the
   libgomp plugin interface — confirm against libgomp-plugin.h.  */

void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}