1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
52 #if CUDA_VERSION < 6000
53 extern CUresult
cuGetErrorString (CUresult
, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
57 #if CUDA_VERSION >= 6050
60 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
61 const char *, unsigned, CUjit_option
*, void **);
62 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
64 typedef size_t (*CUoccupancyB2DSize
)(int);
65 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
66 const char *, unsigned, CUjit_option
*, void **);
67 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
68 CUresult
cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction
,
69 CUoccupancyB2DSize
, size_t, int);
72 #define DO_PRAGMA(x) _Pragma (#x)
74 #if PLUGIN_NVPTX_DYNAMIC
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
83 #include "cuda-lib.def"
85 # undef CUDA_ONE_CALL_MAYBE_NULL
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited
= -1;
93 /* Dynamically load the CUDA runtime library and initialize function
94 pointers, return false if unsuccessful, true if successful. */
98 if (cuda_lib_inited
!= -1)
99 return cuda_lib_inited
;
100 const char *cuda_runtime_lib
= "libcuda.so.1";
101 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
102 cuda_lib_inited
= false;
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
117 cuda_lib_inited
= true;
120 # define CUDA_CALL_PREFIX cuda_lib.
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
133 #include "secure_getenv.h"
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
140 /* Convenience macros for the frequently used CUDA library call and
141 error handling sequence as well as CUDA library calls that
142 do the error checking themselves or don't do it at all. */
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
150 GOMP_PLUGIN_error (#FN " error: %s", \
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159 #define CUDA_CALL_ASSERT(FN, ...) \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
173 #define CUDA_CALL_EXISTS(FN) \
177 cuda_error (CUresult r
)
179 const char *fallback
= "unknown cuda error";
182 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
185 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
186 if (r
== CUDA_SUCCESS
)
192 static unsigned int instantiated_devices
= 0;
193 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
198 pthread_t host_thread
;
209 struct ptx_stream
*next
;
212 /* Thread-specific data for PTX. */
216 struct ptx_stream
*current_stream
;
217 struct ptx_device
*ptx_dev
;
221 map_init (struct ptx_stream
*s
)
223 int size
= getpagesize ();
229 CUDA_CALL (cuMemAllocHost
, &s
->h
, size
);
230 CUDA_CALL (cuMemHostGetDevicePointer
, &s
->d
, s
->h
, 0);
235 s
->h_end
= s
->h_begin
+ size
;
236 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
244 map_fini (struct ptx_stream
*s
)
246 CUDA_CALL (cuMemFreeHost
, s
->h
);
251 map_pop (struct ptx_stream
*s
)
258 s
->h_tail
= s
->h_next
;
260 if (s
->h_tail
>= s
->h_end
)
261 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
263 if (s
->h_next
== s
->h_tail
)
264 s
->h_prev
= s
->h_next
;
266 assert (s
->h_next
>= s
->h_begin
);
267 assert (s
->h_tail
>= s
->h_begin
);
268 assert (s
->h_prev
>= s
->h_begin
);
270 assert (s
->h_next
<= s
->h_end
);
271 assert (s
->h_tail
<= s
->h_end
);
272 assert (s
->h_prev
<= s
->h_end
);
276 map_push (struct ptx_stream
*s
, size_t size
, void **h
, void **d
)
283 left
= s
->h_end
- s
->h_next
;
290 assert (s
->h_next
== s
->h_prev
);
291 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
296 offset
= s
->h_next
- s
->h
;
298 *d
= (void *)(s
->d
+ offset
);
299 *h
= (void *)(s
->h
+ offset
);
301 s
->h_prev
= s
->h_next
;
307 assert (s
->h_next
>= s
->h_begin
);
308 assert (s
->h_tail
>= s
->h_begin
);
309 assert (s
->h_prev
>= s
->h_begin
);
310 assert (s
->h_next
<= s
->h_end
);
311 assert (s
->h_tail
<= s
->h_end
);
312 assert (s
->h_prev
<= s
->h_end
);
317 /* Target data function launch information. */
319 struct targ_fn_launch
322 unsigned short dim
[GOMP_DIM_MAX
];
325 /* Target PTX object information. */
333 /* Target data image information. */
335 typedef struct nvptx_tdata
337 const struct targ_ptx_obj
*ptx_objs
;
340 const char *const *var_names
;
343 const struct targ_fn_launch
*fn_descs
;
347 /* Descriptor of a loaded function. */
349 struct targ_fn_descriptor
352 const struct targ_fn_launch
*launch
;
354 int max_threads_per_block
;
357 /* A loaded PTX image. */
358 struct ptx_image_data
360 const void *target_data
;
363 struct targ_fn_descriptor
*fns
; /* Array of functions. */
365 struct ptx_image_data
*next
;
373 struct ptx_stream
*null_stream
;
374 /* All non-null streams associated with this device (actually context),
375 either created implicitly or passed in from the user (via
376 acc_set_cuda_stream). */
377 struct ptx_stream
*active_streams
;
379 struct ptx_stream
**arr
;
382 /* A lock for use when manipulating the above stream list and array. */
383 pthread_mutex_t stream_lock
;
395 int max_threads_per_block
;
396 int max_threads_per_multiprocessor
;
397 int default_dims
[GOMP_DIM_MAX
];
399 struct ptx_image_data
*images
; /* Images loaded on device. */
400 pthread_mutex_t image_lock
; /* Lock for above list. */
402 struct ptx_device
*next
;
410 PTX_EVT_ASYNC_CLEANUP
421 struct ptx_event
*next
;
424 static pthread_mutex_t ptx_event_lock
;
425 static struct ptx_event
*ptx_events
;
427 static struct ptx_device
**ptx_devices
;
429 static inline struct nvptx_thread
*
432 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
436 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
439 struct ptx_stream
*null_stream
440 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
442 null_stream
->stream
= NULL
;
443 null_stream
->host_thread
= pthread_self ();
444 null_stream
->multithreaded
= true;
445 null_stream
->d
= (CUdeviceptr
) NULL
;
446 null_stream
->h
= NULL
;
447 if (!map_init (null_stream
))
450 ptx_dev
->null_stream
= null_stream
;
451 ptx_dev
->active_streams
= NULL
;
452 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
457 /* This is just a guess -- make space for as many async streams as the
458 current device is capable of concurrently executing. This can grow
459 later as necessary. No streams are created yet. */
460 ptx_dev
->async_streams
.arr
461 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
462 ptx_dev
->async_streams
.size
= concurrency
;
464 for (i
= 0; i
< concurrency
; i
++)
465 ptx_dev
->async_streams
.arr
[i
] = NULL
;
471 fini_streams_for_device (struct ptx_device
*ptx_dev
)
473 free (ptx_dev
->async_streams
.arr
);
476 while (ptx_dev
->active_streams
!= NULL
)
478 struct ptx_stream
*s
= ptx_dev
->active_streams
;
479 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
483 CUresult r
= CUDA_CALL_NOCHECK (cuStreamDestroy
, s
->stream
);
484 if (r
!= CUDA_SUCCESS
)
486 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r
));
492 ret
&= map_fini (ptx_dev
->null_stream
);
493 free (ptx_dev
->null_stream
);
497 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
498 thread THREAD (and also current device/context). If CREATE is true, create
499 the stream if it does not exist (or use EXISTING if it is non-NULL), and
500 associate the stream with the same thread argument. Returns stream to use
503 static struct ptx_stream
*
504 select_stream_for_async (int async
, pthread_t thread
, bool create
,
507 struct nvptx_thread
*nvthd
= nvptx_thread ();
508 /* Local copy of TLS variable. */
509 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
510 struct ptx_stream
*stream
= NULL
;
511 int orig_async
= async
;
513 /* The special value acc_async_noval (-1) maps (for now) to an
514 implicitly-created stream, which is then handled the same as any other
515 numbered async stream. Other options are available, e.g. using the null
516 stream for anonymous async operations, or choosing an idle stream from an
517 active set. But, stick with this for now. */
518 if (async
> acc_async_sync
)
522 pthread_mutex_lock (&ptx_dev
->stream_lock
);
524 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
525 null stream, and in fact better performance may be obtainable if it doesn't
526 (because the null stream enforces overly-strict synchronisation with
527 respect to other streams for legacy reasons, and that's probably not
528 needed with OpenACC). Maybe investigate later. */
529 if (async
== acc_async_sync
)
530 stream
= ptx_dev
->null_stream
;
531 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
532 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
533 stream
= ptx_dev
->async_streams
.arr
[async
];
534 else if (async
>= 0 && create
)
536 if (async
>= ptx_dev
->async_streams
.size
)
538 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
540 if (async
>= newsize
)
543 ptx_dev
->async_streams
.arr
544 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
545 newsize
* sizeof (struct ptx_stream
*));
547 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
548 ptx_dev
->async_streams
.arr
[i
] = NULL
;
550 ptx_dev
->async_streams
.size
= newsize
;
553 /* Create a new stream on-demand if there isn't one already, or if we're
554 setting a particular async value to an existing (externally-provided)
556 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
560 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
563 s
->stream
= existing
;
566 r
= CUDA_CALL_NOCHECK (cuStreamCreate
, &s
->stream
,
568 if (r
!= CUDA_SUCCESS
)
570 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
571 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
576 /* If CREATE is true, we're going to be queueing some work on this
577 stream. Associate it with the current host thread. */
578 s
->host_thread
= thread
;
579 s
->multithreaded
= false;
581 s
->d
= (CUdeviceptr
) NULL
;
585 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
586 GOMP_PLUGIN_fatal ("map_init fail");
589 s
->next
= ptx_dev
->active_streams
;
590 ptx_dev
->active_streams
= s
;
591 ptx_dev
->async_streams
.arr
[async
] = s
;
594 stream
= ptx_dev
->async_streams
.arr
[async
];
599 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
600 GOMP_PLUGIN_fatal ("bad async %d", async
);
605 assert (stream
!= NULL
);
607 /* If we're trying to use the same stream from different threads
608 simultaneously, set stream->multithreaded to true. This affects the
609 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
610 only wait for asynchronous launches from the same host thread they are
611 invoked on. If multiple threads use the same async value, we make note
612 of that here and fall back to testing/waiting for all threads in those
614 if (thread
!= stream
->host_thread
)
615 stream
->multithreaded
= true;
617 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
619 else if (stream
&& !stream
->multithreaded
620 && !pthread_equal (stream
->host_thread
, thread
))
621 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
626 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
627 should be locked on entry and remains locked on exit. */
634 if (instantiated_devices
!= 0)
638 pthread_mutex_init (&ptx_event_lock
, NULL
);
640 if (!init_cuda_lib ())
643 CUDA_CALL (cuInit
, 0);
645 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
646 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
651 /* Select the N'th PTX device for the current host thread. The device must
652 have been previously opened before calling this function. */
655 nvptx_attach_host_thread_to_device (int n
)
659 struct ptx_device
*ptx_dev
;
662 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
663 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
665 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
669 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
675 ptx_dev
= ptx_devices
[n
];
678 GOMP_PLUGIN_error ("device %d not found", n
);
682 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
684 /* We don't necessarily have a current context (e.g. if it has been
685 destroyed).  Pop it if we do though.  */
687 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
689 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
694 static struct ptx_device
*
695 nvptx_open_device (int n
)
697 struct ptx_device
*ptx_dev
;
698 CUdevice dev
, ctx_dev
;
700 int async_engines
, pi
;
702 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
704 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
708 ptx_dev
->ctx_shared
= false;
710 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
711 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
713 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
717 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
719 /* The current host thread has an active context for a different device.
722 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
725 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
728 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
730 ptx_dev
->ctx_shared
= true;
732 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
733 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
734 ptx_dev
->overlap
= pi
;
736 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
737 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
740 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
741 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
742 ptx_dev
->concur
= pi
;
744 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
745 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
748 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
749 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
752 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
753 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
754 ptx_dev
->clock_khz
= pi
;
756 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
757 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
758 ptx_dev
->num_sms
= pi
;
760 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
761 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
762 ptx_dev
->regs_per_block
= pi
;
764 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
765 in CUDA 6.0 and newer. */
766 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
767 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
769 /* Fallback: use limit of registers per block, which is usually equal. */
770 if (r
== CUDA_ERROR_INVALID_VALUE
)
771 pi
= ptx_dev
->regs_per_block
;
772 else if (r
!= CUDA_SUCCESS
)
774 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
777 ptx_dev
->regs_per_sm
= pi
;
779 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
780 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
783 GOMP_PLUGIN_error ("Only warp size 32 is supported");
786 ptx_dev
->warp_size
= pi
;
788 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
789 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
790 ptx_dev
->max_threads_per_block
= pi
;
792 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
793 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
794 ptx_dev
->max_threads_per_multiprocessor
= pi
;
796 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &async_engines
,
797 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
798 if (r
!= CUDA_SUCCESS
)
801 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
802 ptx_dev
->default_dims
[i
] = 0;
804 ptx_dev
->images
= NULL
;
805 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
807 if (!init_streams_for_device (ptx_dev
, async_engines
))
814 nvptx_close_device (struct ptx_device
*ptx_dev
)
819 if (!fini_streams_for_device (ptx_dev
))
822 pthread_mutex_destroy (&ptx_dev
->image_lock
);
824 if (!ptx_dev
->ctx_shared
)
825 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
832 nvptx_get_num_devices (void)
836 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
838 if (sizeof (void *) != 8)
840 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
841 " only 64-bit configurations are supported\n");
845 /* This function will be called before the plugin has been initialized in
846 order to enumerate available devices, but CUDA API routines can't be used
847 until cuInit has been called. Just call it now (but don't yet do any
848 further initialization). */
849 if (instantiated_devices
== 0)
851 if (!init_cuda_lib ())
853 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
854 /* This is not an error: e.g. we may have CUDA libraries installed but
855 no devices available. */
856 if (r
!= CUDA_SUCCESS
)
858 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
864 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
869 notify_var (const char *var_name
, const char *env_var
)
872 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
874 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
878 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
880 const char *var_name
= "GOMP_NVPTX_JIT";
881 const char *env_var
= secure_getenv (var_name
);
882 notify_var (var_name
, env_var
);
887 const char *c
= env_var
;
893 if (c
[0] == '-' && c
[1] == 'O'
894 && '0' <= c
[2] && c
[2] <= '4'
895 && (c
[3] == '\0' || c
[3] == ' '))
897 *gomp_nvptx_o
= c
[2] - '0';
902 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
908 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
911 CUjit_option opts
[7];
916 CUlinkState linkstate
;
919 size_t linkoutsize
__attribute__ ((unused
));
921 opts
[0] = CU_JIT_WALL_TIME
;
922 optvals
[0] = &elapsed
;
924 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
925 optvals
[1] = &ilog
[0];
927 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
928 optvals
[2] = (void *) sizeof ilog
;
930 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
931 optvals
[3] = &elog
[0];
933 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
934 optvals
[4] = (void *) sizeof elog
;
936 opts
[5] = CU_JIT_LOG_VERBOSE
;
937 optvals
[5] = (void *) 1;
939 static intptr_t gomp_nvptx_o
= -1;
941 static bool init_done
= false;
944 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
949 if (gomp_nvptx_o
!= -1)
951 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
952 optvals
[nopts
] = (void *) gomp_nvptx_o
;
956 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
957 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
959 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
961 for (; num_objs
--; ptx_objs
++)
963 /* cuLinkAddData's 'data' argument erroneously omits the const
965 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
966 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
967 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
968 (char *) ptx_objs
->code
, ptx_objs
->size
,
971 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
972 (char *) ptx_objs
->code
, ptx_objs
->size
,
974 if (r
!= CUDA_SUCCESS
)
976 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
977 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
983 GOMP_PLUGIN_debug (0, "Linking\n");
984 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
986 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
987 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
989 if (r
!= CUDA_SUCCESS
)
991 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
995 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
996 CUDA_CALL (cuLinkDestroy
, linkstate
);
1001 event_gc (bool memmap_lockable
)
1003 struct ptx_event
*ptx_event
= ptx_events
;
1004 struct ptx_event
*async_cleanups
= NULL
;
1005 struct nvptx_thread
*nvthd
= nvptx_thread ();
1007 pthread_mutex_lock (&ptx_event_lock
);
1009 while (ptx_event
!= NULL
)
1012 struct ptx_event
*e
= ptx_event
;
1014 ptx_event
= ptx_event
->next
;
1016 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
1019 r
= CUDA_CALL_NOCHECK (cuEventQuery
, *e
->evt
);
1020 if (r
== CUDA_SUCCESS
)
1022 bool append_async
= false;
1037 case PTX_EVT_ASYNC_CLEANUP
:
1039 /* The function gomp_plugin_async_unmap_vars needs to claim the
1040 memory-map splay tree lock for the current device, so we
1041 can't call it when one of our callers has already claimed
1042 the lock. In that case, just delay the GC for this event
1044 if (!memmap_lockable
)
1047 append_async
= true;
1052 CUDA_CALL_NOCHECK (cuEventDestroy
, *te
);
1055 /* Unlink 'e' from ptx_events list. */
1056 if (ptx_events
== e
)
1057 ptx_events
= ptx_events
->next
;
1060 struct ptx_event
*e_
= ptx_events
;
1061 while (e_
->next
!= e
)
1063 e_
->next
= e_
->next
->next
;
1068 e
->next
= async_cleanups
;
1076 pthread_mutex_unlock (&ptx_event_lock
);
1078 /* We have to do these here, after ptx_event_lock is released. */
1079 while (async_cleanups
)
1081 struct ptx_event
*e
= async_cleanups
;
1082 async_cleanups
= async_cleanups
->next
;
1084 GOMP_PLUGIN_async_unmap_vars (e
->addr
, e
->val
);
1090 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
, int val
)
1092 struct ptx_event
*ptx_event
;
1093 struct nvptx_thread
*nvthd
= nvptx_thread ();
1095 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
1096 || type
== PTX_EVT_ASYNC_CLEANUP
);
1098 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
1099 ptx_event
->type
= type
;
1101 ptx_event
->addr
= h
;
1102 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
1103 ptx_event
->val
= val
;
1105 pthread_mutex_lock (&ptx_event_lock
);
1107 ptx_event
->next
= ptx_events
;
1108 ptx_events
= ptx_event
;
1110 pthread_mutex_unlock (&ptx_event_lock
);
1114 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
1115 int async
, unsigned *dims
, void *targ_mem_desc
)
1117 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
1118 CUfunction function
;
1121 struct ptx_stream
*dev_str
;
1124 struct nvptx_thread
*nvthd
= nvptx_thread ();
1125 int warp_size
= nvthd
->ptx_dev
->warp_size
;
1126 const char *maybe_abort_msg
= "(perhaps abort was called)";
1128 function
= targ_fn
->fn
;
1130 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1131 assert (dev_str
== nvthd
->current_stream
);
1133 /* Initialize the launch dimensions. Typically this is constant,
1134 provided by the device compiler, but we must permit runtime
1137 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1139 if (targ_fn
->launch
->dim
[i
])
1140 dims
[i
] = targ_fn
->launch
->dim
[i
];
1147 pthread_mutex_lock (&ptx_dev_lock
);
1149 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
1150 if (!gomp_openacc_dims
[0])
1152 /* See if the user provided GOMP_OPENACC_DIM environment
1153 variable to specify runtime defaults. */
1154 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1155 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
1158 if (!nvthd
->ptx_dev
->default_dims
[0])
1160 int default_dims
[GOMP_DIM_MAX
];
1161 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1162 default_dims
[i
] = gomp_openacc_dims
[i
];
1164 int gang
, worker
, vector
;
1166 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
1167 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
1168 int dev_size
= nvthd
->ptx_dev
->num_sms
;
1169 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1170 " dev_size=%d, cpu_size=%d\n",
1171 warp_size
, block_size
, dev_size
, cpu_size
);
1173 gang
= (cpu_size
/ block_size
) * dev_size
;
1174 worker
= block_size
/ warp_size
;
1178 /* There is no upper bound on the gang size. The best size
1179 matches the hardware configuration. Logical gangs are
1180 scheduled onto physical hardware. To maximize usage, we
1181 should guess a large number. */
1182 if (default_dims
[GOMP_DIM_GANG
] < 1)
1183 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
1184 /* The worker size must not exceed the hardware. */
1185 if (default_dims
[GOMP_DIM_WORKER
] < 1
1186 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
1187 default_dims
[GOMP_DIM_WORKER
] = worker
;
1188 /* The vector size must exactly match the hardware. */
1189 if (default_dims
[GOMP_DIM_VECTOR
] < 1
1190 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
1191 default_dims
[GOMP_DIM_VECTOR
] = vector
;
1193 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1194 default_dims
[GOMP_DIM_GANG
],
1195 default_dims
[GOMP_DIM_WORKER
],
1196 default_dims
[GOMP_DIM_VECTOR
]);
1198 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1199 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
1201 pthread_mutex_unlock (&ptx_dev_lock
);
1204 bool default_dim_p
[GOMP_DIM_MAX
];
1205 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1206 default_dim_p
[i
] = !dims
[i
];
1208 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize
))
1210 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1211 if (default_dim_p
[i
])
1212 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
1214 if (default_dim_p
[GOMP_DIM_VECTOR
])
1215 dims
[GOMP_DIM_VECTOR
]
1216 = MIN (dims
[GOMP_DIM_VECTOR
],
1217 (targ_fn
->max_threads_per_block
/ warp_size
1220 if (default_dim_p
[GOMP_DIM_WORKER
])
1221 dims
[GOMP_DIM_WORKER
]
1222 = MIN (dims
[GOMP_DIM_WORKER
],
1223 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
1227 /* Handle the case that the compiler allows the runtime to choose
1228 the vector-length conservatively, by ignoring
1229 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
1232 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
1233 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
1234 exceed targ_fn->max_threads_per_block. */
1235 int workers
= gomp_openacc_dims
[GOMP_DIM_WORKER
];
1236 int gangs
= gomp_openacc_dims
[GOMP_DIM_GANG
];
1239 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize
, &grids
,
1240 &blocks
, function
, NULL
, 0,
1241 dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]);
1242 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
1243 "grid = %d, block = %d\n", grids
, blocks
);
1245 /* Keep the num_gangs proportional to the block size. In
1246 the case where a block size is limited by shared-memory
1247 or the register file capacity, the runtime will not
1248 excessively over assign gangs to the multiprocessor
1249 units if their state is going to be swapped out even
1250 more than necessary. The constant factor 2 is there to
1251 prevent threads from idling when there is insufficient
1254 gangs
= 2 * grids
* (blocks
/ warp_size
);
1257 vectors
= warp_size
;
1261 int actual_vectors
= (default_dim_p
[GOMP_DIM_VECTOR
]
1263 : dims
[GOMP_DIM_VECTOR
]);
1264 workers
= blocks
/ actual_vectors
;
1267 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1268 if (default_dim_p
[i
])
1271 case GOMP_DIM_GANG
: dims
[i
] = gangs
; break;
1272 case GOMP_DIM_WORKER
: dims
[i
] = workers
; break;
1273 case GOMP_DIM_VECTOR
: dims
[i
] = vectors
; break;
1274 default: GOMP_PLUGIN_fatal ("invalid dim");
1280 /* Check if the accelerator has sufficient hardware resources to
1281 launch the offloaded kernel. */
1282 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
1283 > targ_fn
->max_threads_per_block
)
1286 = targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
];
1287 GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
1288 " launch '%s' with num_workers = %d; recompile the"
1289 " program with 'num_workers = %d' on that offloaded"
1290 " region or '-fopenacc-dim=:%d'",
1291 targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
1292 suggest_workers
, suggest_workers
);
1295 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1296 the host and the device. HP is a host pointer to the new chunk, and DP is
1297 the corresponding device pointer. */
1298 map_push (dev_str
, mapnum
* sizeof (void *), &hp
, &dp
);
1300 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1302 /* Copy the array of arguments to the mapped page. */
1303 for (i
= 0; i
< mapnum
; i
++)
1304 ((void **) hp
)[i
] = devaddrs
[i
];
1306 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1307 fact have the same value on a unified-memory system). */
1308 CUDA_CALL_ASSERT (cuMemcpy
, (CUdeviceptr
) dp
, (CUdeviceptr
) hp
,
1309 mapnum
* sizeof (void *));
1310 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1311 " gangs=%u, workers=%u, vectors=%u\n",
1312 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
1313 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
1317 // num_gangs nctaid.x
1318 // num_workers ntid.y
1319 // vector length ntid.x
1322 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
1323 dims
[GOMP_DIM_GANG
], 1, 1,
1324 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
1325 0, dev_str
->stream
, kargs
, 0);
1327 #ifndef DISABLE_ASYNC
1328 if (async
< acc_async_noval
)
1330 r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, dev_str
->stream
);
1331 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1332 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1334 else if (r
!= CUDA_SUCCESS
)
1335 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1341 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1343 r
= CUDA_CALL_NOCHECK (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1344 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1345 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
1347 else if (r
!= CUDA_SUCCESS
)
1348 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1352 CUDA_CALL_ASSERT (cuEventRecord
, *e
, dev_str
->stream
);
1354 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
, 0);
1357 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
1358 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1359 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1361 else if (r
!= CUDA_SUCCESS
)
1362 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1365 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1366 targ_fn
->launch
->fn
);
1368 #ifndef DISABLE_ASYNC
1369 if (async
< acc_async_noval
)
1374 void * openacc_get_current_cuda_context (void);
1377 nvptx_alloc (size_t s
)
1381 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1386 nvptx_free (void *p
)
1391 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1392 if ((CUdeviceptr
) p
!= pb
)
1394 GOMP_PLUGIN_error ("invalid device address");
1398 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1404 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1408 struct nvptx_thread
*nvthd
= nvptx_thread ();
1414 GOMP_PLUGIN_error ("invalid device address");
1418 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1422 GOMP_PLUGIN_error ("invalid device address");
1427 GOMP_PLUGIN_error ("invalid host address");
1432 GOMP_PLUGIN_error ("invalid host or device address");
1435 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1437 GOMP_PLUGIN_error ("invalid size");
1441 #ifndef DISABLE_ASYNC
1442 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1444 CUevent
*e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1445 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1447 CUDA_CALL (cuMemcpyHtoDAsync
,
1448 (CUdeviceptr
) d
, h
, s
, nvthd
->current_stream
->stream
);
1449 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1450 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1454 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) d
, h
, s
);
1460 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1464 struct nvptx_thread
*nvthd
= nvptx_thread ();
1470 GOMP_PLUGIN_error ("invalid device address");
1474 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1478 GOMP_PLUGIN_error ("invalid device address");
1483 GOMP_PLUGIN_error ("invalid host address");
1488 GOMP_PLUGIN_error ("invalid host or device address");
1491 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1493 GOMP_PLUGIN_error ("invalid size");
1497 #ifndef DISABLE_ASYNC
1498 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1500 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1501 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1503 CUDA_CALL (cuMemcpyDtoHAsync
,
1504 h
, (CUdeviceptr
) d
, s
, nvthd
->current_stream
->stream
);
1505 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1506 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1510 CUDA_CALL (cuMemcpyDtoH
, h
, (CUdeviceptr
) d
, s
);
1516 nvptx_set_async (int async
)
1518 struct nvptx_thread
*nvthd
= nvptx_thread ();
1519 nvthd
->current_stream
1520 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1524 nvptx_async_test (int async
)
1527 struct ptx_stream
*s
;
1529 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1532 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1534 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1535 if (r
== CUDA_SUCCESS
)
1537 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1538 whether all work has completed on this stream, and if so omits the call
1539 to the wait hook. If that happens, event_gc might not get called
1540 (which prevents variables from getting unmapped and their associated
1541 device storage freed), so call it here. */
1545 else if (r
== CUDA_ERROR_NOT_READY
)
1548 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1554 nvptx_async_test_all (void)
1556 struct ptx_stream
*s
;
1557 pthread_t self
= pthread_self ();
1558 struct nvptx_thread
*nvthd
= nvptx_thread ();
1560 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1562 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1564 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1565 && CUDA_CALL_NOCHECK (cuStreamQuery
,
1566 s
->stream
) == CUDA_ERROR_NOT_READY
)
1568 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1573 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1581 nvptx_wait (int async
)
1583 struct ptx_stream
*s
;
1585 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1587 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1589 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1595 nvptx_wait_async (int async1
, int async2
)
1598 struct ptx_stream
*s1
, *s2
;
1599 pthread_t self
= pthread_self ();
1601 /* The stream that is waiting (rather than being waited for) doesn't
1602 necessarily have to exist already. */
1603 s2
= select_stream_for_async (async2
, self
, true, NULL
);
1605 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1607 GOMP_PLUGIN_fatal ("invalid async 1\n");
1610 GOMP_PLUGIN_fatal ("identical parameters");
1612 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1614 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1618 CUDA_CALL_ASSERT (cuEventRecord
, *e
, s1
->stream
);
1620 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1622 CUDA_CALL_ASSERT (cuStreamWaitEvent
, s2
->stream
, *e
, 0);
1626 nvptx_wait_all (void)
1629 struct ptx_stream
*s
;
1630 pthread_t self
= pthread_self ();
1631 struct nvptx_thread
*nvthd
= nvptx_thread ();
1633 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1635 /* Wait for active streams initiated by this thread (or by multiple threads)
1637 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1639 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1641 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1642 if (r
== CUDA_SUCCESS
)
1644 else if (r
!= CUDA_ERROR_NOT_READY
)
1645 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1647 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1651 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1657 nvptx_wait_all_async (int async
)
1659 struct ptx_stream
*waiting_stream
, *other_stream
;
1661 struct nvptx_thread
*nvthd
= nvptx_thread ();
1662 pthread_t self
= pthread_self ();
1664 /* The stream doing the waiting. This could be the first mention of the
1665 stream, so create it if necessary. */
1667 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1669 /* Launches on the null stream already block on other streams in the
1671 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
1676 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1678 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1679 other_stream
!= NULL
;
1680 other_stream
= other_stream
->next
)
1682 if (!other_stream
->multithreaded
1683 && !pthread_equal (other_stream
->host_thread
, self
))
1686 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1688 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1690 /* Record an event on the waited-for stream. */
1691 CUDA_CALL_ASSERT (cuEventRecord
, *e
, other_stream
->stream
);
1693 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1695 CUDA_CALL_ASSERT (cuStreamWaitEvent
, waiting_stream
->stream
, *e
, 0);
1698 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1702 nvptx_get_current_cuda_device (void)
1704 struct nvptx_thread
*nvthd
= nvptx_thread ();
1706 if (!nvthd
|| !nvthd
->ptx_dev
)
1709 return &nvthd
->ptx_dev
->dev
;
1713 nvptx_get_current_cuda_context (void)
1715 struct nvptx_thread
*nvthd
= nvptx_thread ();
1717 if (!nvthd
|| !nvthd
->ptx_dev
)
1720 return nvthd
->ptx_dev
->ctx
;
1724 nvptx_get_cuda_stream (int async
)
1726 struct ptx_stream
*s
;
1727 struct nvptx_thread
*nvthd
= nvptx_thread ();
1729 if (!nvthd
|| !nvthd
->ptx_dev
)
1732 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1734 return s
? s
->stream
: NULL
;
1738 nvptx_set_cuda_stream (int async
, void *stream
)
1740 struct ptx_stream
*oldstream
;
1741 pthread_t self
= pthread_self ();
1742 struct nvptx_thread
*nvthd
= nvptx_thread ();
1745 GOMP_PLUGIN_fatal ("bad async %d", async
);
1747 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1749 /* We have a list of active streams and an array mapping async values to
1750 entries of that list. We need to take "ownership" of the passed-in stream,
1751 and add it to our list, removing the previous entry also (if there was one)
1752 in order to prevent resource leaks. Note the potential for surprise
1753 here: maybe we should keep track of passed-in streams and leave it up to
1754 the user to tidy those up, but that doesn't work for stream handles
1755 returned from acc_get_cuda_stream above... */
1757 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
1761 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1762 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
1765 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1766 while (s
->next
!= oldstream
)
1768 s
->next
= s
->next
->next
;
1771 CUDA_CALL_ASSERT (cuStreamDestroy
, oldstream
->stream
);
1773 if (!map_fini (oldstream
))
1774 GOMP_PLUGIN_fatal ("error when freeing host memory");
1779 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1781 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1786 /* Plugin entry points. */
1789 GOMP_OFFLOAD_get_name (void)
1795 GOMP_OFFLOAD_get_caps (void)
1797 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1801 GOMP_OFFLOAD_get_type (void)
1803 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
1807 GOMP_OFFLOAD_get_num_devices (void)
1809 return nvptx_get_num_devices ();
1813 GOMP_OFFLOAD_init_device (int n
)
1815 struct ptx_device
*dev
;
1817 pthread_mutex_lock (&ptx_dev_lock
);
1819 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1821 pthread_mutex_unlock (&ptx_dev_lock
);
1825 dev
= nvptx_open_device (n
);
1828 ptx_devices
[n
] = dev
;
1829 instantiated_devices
++;
1832 pthread_mutex_unlock (&ptx_dev_lock
);
1838 GOMP_OFFLOAD_fini_device (int n
)
1840 pthread_mutex_lock (&ptx_dev_lock
);
1842 if (ptx_devices
[n
] != NULL
)
1844 if (!nvptx_attach_host_thread_to_device (n
)
1845 || !nvptx_close_device (ptx_devices
[n
]))
1847 pthread_mutex_unlock (&ptx_dev_lock
);
1850 ptx_devices
[n
] = NULL
;
1851 instantiated_devices
--;
1854 pthread_mutex_unlock (&ptx_dev_lock
);
1858 /* Return the libgomp version number we're compatible with. There is
1859 no requirement for cross-version compatibility. */
1862 GOMP_OFFLOAD_version (void)
1864 return GOMP_VERSION
;
1867 /* Initialize __nvptx_clocktick, if present in MODULE. */
1870 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
1873 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1874 module
, "__nvptx_clocktick");
1875 if (r
== CUDA_ERROR_NOT_FOUND
)
1877 if (r
!= CUDA_SUCCESS
)
1878 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1879 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1880 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1881 sizeof (__nvptx_clocktick
));
1882 if (r
!= CUDA_SUCCESS
)
1883 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1886 /* Load the (partial) program described by TARGET_DATA to device
1887 number ORD. Allocate and return TARGET_TABLE. */
1890 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1891 struct addr_pair
**target_table
)
1894 const char *const *var_names
;
1895 const struct targ_fn_launch
*fn_descs
;
1896 unsigned int fn_entries
, var_entries
, i
, j
;
1897 struct targ_fn_descriptor
*targ_fns
;
1898 struct addr_pair
*targ_tbl
;
1899 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1900 struct ptx_image_data
*new_image
;
1901 struct ptx_device
*dev
;
1903 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1905 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1906 " (expected %u, received %u)",
1907 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1911 if (!nvptx_attach_host_thread_to_device (ord
)
1912 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1915 dev
= ptx_devices
[ord
];
1917 /* The mkoffload utility emits a struct of pointers/integers at the
1918 start of each offload image. The array of kernel names and the
1919 functions addresses form a one-to-one correspondence. */
1921 var_entries
= img_header
->var_num
;
1922 var_names
= img_header
->var_names
;
1923 fn_entries
= img_header
->fn_num
;
1924 fn_descs
= img_header
->fn_descs
;
1926 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1927 * (fn_entries
+ var_entries
));
1928 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1931 *target_table
= targ_tbl
;
1933 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1934 new_image
->target_data
= target_data
;
1935 new_image
->module
= module
;
1936 new_image
->fns
= targ_fns
;
1938 pthread_mutex_lock (&dev
->image_lock
);
1939 new_image
->next
= dev
->images
;
1940 dev
->images
= new_image
;
1941 pthread_mutex_unlock (&dev
->image_lock
);
1943 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1945 CUfunction function
;
1948 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1950 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1951 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1952 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1953 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1955 targ_fns
->fn
= function
;
1956 targ_fns
->launch
= &fn_descs
[i
];
1957 targ_fns
->regs_per_thread
= nregs
;
1958 targ_fns
->max_threads_per_block
= mthrs
;
1960 targ_tbl
->start
= (uintptr_t) targ_fns
;
1961 targ_tbl
->end
= targ_tbl
->start
+ 1;
1964 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1969 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1970 &var
, &bytes
, module
, var_names
[j
]);
1972 targ_tbl
->start
= (uintptr_t) var
;
1973 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1976 nvptx_set_clocktick (module
, dev
);
1978 return fn_entries
+ var_entries
;
1981 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1982 function descriptors allocated by G_O_load_image. */
1985 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1987 struct ptx_image_data
*image
, **prev_p
;
1988 struct ptx_device
*dev
= ptx_devices
[ord
];
1990 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1992 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1993 " (expected %u, received %u)",
1994 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1999 pthread_mutex_lock (&dev
->image_lock
);
2000 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
2001 if (image
->target_data
== target_data
)
2003 *prev_p
= image
->next
;
2004 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
2010 pthread_mutex_unlock (&dev
->image_lock
);
2015 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
2017 if (!nvptx_attach_host_thread_to_device (ord
))
2019 return nvptx_alloc (size
);
2023 GOMP_OFFLOAD_free (int ord
, void *ptr
)
2025 return (nvptx_attach_host_thread_to_device (ord
)
2026 && nvptx_free (ptr
));
2030 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
2032 return (nvptx_attach_host_thread_to_device (ord
)
2033 && nvptx_dev2host (dst
, src
, n
));
2037 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
2039 return (nvptx_attach_host_thread_to_device (ord
)
2040 && nvptx_host2dev (dst
, src
, n
));
2044 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
2046 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2047 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
,
2048 ptx_dev
->null_stream
->stream
);
2052 void (*device_run
) (int n
, void *fn_ptr
, void *vars
) = NULL
;
2055 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *), size_t mapnum
,
2056 void **hostaddrs
, void **devaddrs
,
2057 int async
, unsigned *dims
, void *targ_mem_desc
)
2059 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, async
, dims
, targ_mem_desc
);
2063 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
, int async
)
2065 struct nvptx_thread
*nvthd
= nvptx_thread ();
2066 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
2068 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
2069 CUDA_CALL_ASSERT (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
2070 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
, async
);
/* Thin OpenACC async-queue hooks, each forwarding to the corresponding
   nvptx_* helper above.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
2116 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
2118 struct ptx_device
*ptx_dev
;
2119 struct nvptx_thread
*nvthd
2120 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
2123 ptx_dev
= ptx_devices
[ord
];
2127 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
2129 assert (ptx_dev
->ctx
);
2132 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
2134 nvthd
->current_stream
= ptx_dev
->null_stream
;
2135 nvthd
->ptx_dev
= ptx_dev
;
2137 return (void *) nvthd
;
/* Release per-thread state allocated by the create hook above.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}
2174 /* Adjust launch dimensions: pick good values for number of blocks and warps
2175 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2179 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
2180 struct ptx_device
*ptx_dev
,
2181 int *teams_p
, int *threads_p
)
2183 int max_warps_block
= fn
->max_threads_per_block
/ 32;
2184 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2185 and libgcc, which matches documented limit of all GPUs as of 2015. */
2186 if (max_warps_block
> 32)
2187 max_warps_block
= 32;
2188 if (*threads_p
<= 0)
2190 if (*threads_p
> max_warps_block
)
2191 *threads_p
= max_warps_block
;
2193 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
2194 /* This is an estimate of how many blocks the device can host simultaneously.
2195 Actual limit, which may be lower, can be queried with "occupancy control"
2196 driver interface (since CUDA 6.0). */
2197 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
2198 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
2199 *teams_p
= max_blocks
;
2202 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2206 nvptx_stacks_size ()
2211 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2214 nvptx_stacks_alloc (size_t size
, int num
)
2217 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &stacks
, size
* num
);
2218 if (r
!= CUDA_SUCCESS
)
2219 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
2220 return (void *) stacks
;
2223 /* Release storage previously allocated by nvptx_stacks_alloc. */
2226 nvptx_stacks_free (void *p
, int num
)
2228 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, (CUdeviceptr
) p
);
2229 if (r
!= CUDA_SUCCESS
)
2230 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
2234 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
2236 CUfunction function
= ((struct targ_fn_descriptor
*) tgt_fn
)->fn
;
2238 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2239 const char *maybe_abort_msg
= "(perhaps abort was called)";
2240 int teams
= 0, threads
= 0;
2243 GOMP_PLUGIN_fatal ("No target arguments provided");
2246 intptr_t id
= (intptr_t) *args
++, val
;
2247 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
2248 val
= (intptr_t) *args
++;
2250 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
2251 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
2253 val
= val
> INT_MAX
? INT_MAX
: val
;
2254 id
&= GOMP_TARGET_ARG_ID_MASK
;
2255 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
2257 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
2260 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
2262 size_t stack_size
= nvptx_stacks_size ();
2263 void *stacks
= nvptx_stacks_alloc (stack_size
, teams
* threads
);
2264 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
2265 size_t fn_args_size
= sizeof fn_args
;
2267 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
2268 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
2271 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
2272 32, threads
, 1, 0, ptx_dev
->null_stream
->stream
,
2274 if (r
!= CUDA_SUCCESS
)
2275 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
2277 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2278 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2279 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2281 else if (r
!= CUDA_SUCCESS
)
2282 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2283 nvptx_stacks_free (stacks
, teams
* threads
);
2287 GOMP_OFFLOAD_async_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
,
2290 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");