1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2016 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
52 cuda_error (CUresult r
)
54 #if CUDA_VERSION < 7000
55 /* Specified in documentation and present in library from at least
56 5.5. Not declared in header file prior to 7.0. */
57 extern CUresult
cuGetErrorString (CUresult
, const char **);
61 r
= cuGetErrorString (r
, &desc
);
62 if (r
!= CUDA_SUCCESS
)
63 desc
= "unknown cuda error";
68 /* Convenience macros for the frequently used CUDA library call and
69 error handling sequence. This does not capture all the cases we
70 use in this file, but is common enough. */
72 #define CUDA_CALL_ERET(ERET, FN, ...) \
74 unsigned __r = FN (__VA_ARGS__); \
75 if (__r != CUDA_SUCCESS) \
77 GOMP_PLUGIN_error (#FN " error: %s", \
83 #define CUDA_CALL(FN, ...) \
84 CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
86 #define CUDA_CALL_ASSERT(FN, ...) \
88 unsigned __r = FN (__VA_ARGS__); \
89 if (__r != CUDA_SUCCESS) \
91 GOMP_PLUGIN_fatal (#FN " error: %s", \
96 static unsigned int instantiated_devices
= 0;
97 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
102 pthread_t host_thread
;
113 struct ptx_stream
*next
;
116 /* Thread-specific data for PTX. */
120 struct ptx_stream
*current_stream
;
121 struct ptx_device
*ptx_dev
;
132 map_init (struct ptx_stream
*s
)
134 int size
= getpagesize ();
140 CUDA_CALL (cuMemAllocHost
, &s
->h
, size
);
141 CUDA_CALL (cuMemHostGetDevicePointer
, &s
->d
, s
->h
, 0);
146 s
->h_end
= s
->h_begin
+ size
;
147 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
155 map_fini (struct ptx_stream
*s
)
157 CUDA_CALL (cuMemFreeHost
, s
->h
);
162 map_pop (struct ptx_stream
*s
)
173 s
->h_tail
+= m
->size
;
175 if (s
->h_tail
>= s
->h_end
)
176 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
178 if (s
->h_next
== s
->h_tail
)
179 s
->h_prev
= s
->h_next
;
181 assert (s
->h_next
>= s
->h_begin
);
182 assert (s
->h_tail
>= s
->h_begin
);
183 assert (s
->h_prev
>= s
->h_begin
);
185 assert (s
->h_next
<= s
->h_end
);
186 assert (s
->h_tail
<= s
->h_end
);
187 assert (s
->h_prev
<= s
->h_end
);
191 map_push (struct ptx_stream
*s
, int async
, size_t size
, void **h
, void **d
)
199 left
= s
->h_end
- s
->h_next
;
200 size
+= sizeof (struct map
);
209 s
->h_next
= s
->h_begin
;
211 if (s
->h_next
+ size
> s
->h_end
)
212 GOMP_PLUGIN_fatal ("unable to push map");
221 offset
= (void *)&m
->mappings
[0] - s
->h
;
223 *d
= (void *)(s
->d
+ offset
);
224 *h
= (void *)(s
->h
+ offset
);
226 s
->h_prev
= s
->h_next
;
232 assert (s
->h_next
>= s
->h_begin
);
233 assert (s
->h_tail
>= s
->h_begin
);
234 assert (s
->h_prev
>= s
->h_begin
);
235 assert (s
->h_next
<= s
->h_end
);
236 assert (s
->h_tail
<= s
->h_end
);
237 assert (s
->h_prev
<= s
->h_end
);
242 /* Target data function launch information. */
244 struct targ_fn_launch
247 unsigned short dim
[GOMP_DIM_MAX
];
250 /* Target PTX object information. */
258 /* Target data image information. */
260 typedef struct nvptx_tdata
262 const struct targ_ptx_obj
*ptx_objs
;
265 const char *const *var_names
;
268 const struct targ_fn_launch
*fn_descs
;
272 /* Descriptor of a loaded function. */
274 struct targ_fn_descriptor
277 const struct targ_fn_launch
*launch
;
279 int max_threads_per_block
;
282 /* A loaded PTX image. */
283 struct ptx_image_data
285 const void *target_data
;
288 struct targ_fn_descriptor
*fns
; /* Array of functions. */
290 struct ptx_image_data
*next
;
298 struct ptx_stream
*null_stream
;
299 /* All non-null streams associated with this device (actually context),
300 either created implicitly or passed in from the user (via
301 acc_set_cuda_stream). */
302 struct ptx_stream
*active_streams
;
304 struct ptx_stream
**arr
;
307 /* A lock for use when manipulating the above stream list and array. */
308 pthread_mutex_t stream_lock
;
320 struct ptx_image_data
*images
; /* Images loaded on device. */
321 pthread_mutex_t image_lock
; /* Lock for above list. */
323 struct ptx_device
*next
;
331 PTX_EVT_ASYNC_CLEANUP
342 struct ptx_event
*next
;
345 static pthread_mutex_t ptx_event_lock
;
346 static struct ptx_event
*ptx_events
;
348 static struct ptx_device
**ptx_devices
;
350 static inline struct nvptx_thread
*
353 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
357 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
360 struct ptx_stream
*null_stream
361 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
363 null_stream
->stream
= NULL
;
364 null_stream
->host_thread
= pthread_self ();
365 null_stream
->multithreaded
= true;
366 null_stream
->d
= (CUdeviceptr
) NULL
;
367 null_stream
->h
= NULL
;
368 if (!map_init (null_stream
))
371 ptx_dev
->null_stream
= null_stream
;
372 ptx_dev
->active_streams
= NULL
;
373 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
378 /* This is just a guess -- make space for as many async streams as the
379 current device is capable of concurrently executing. This can grow
380 later as necessary. No streams are created yet. */
381 ptx_dev
->async_streams
.arr
382 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
383 ptx_dev
->async_streams
.size
= concurrency
;
385 for (i
= 0; i
< concurrency
; i
++)
386 ptx_dev
->async_streams
.arr
[i
] = NULL
;
392 fini_streams_for_device (struct ptx_device
*ptx_dev
)
394 free (ptx_dev
->async_streams
.arr
);
397 while (ptx_dev
->active_streams
!= NULL
)
399 struct ptx_stream
*s
= ptx_dev
->active_streams
;
400 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
404 CUresult r
= cuStreamDestroy (s
->stream
);
405 if (r
!= CUDA_SUCCESS
)
407 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r
));
413 ret
&= map_fini (ptx_dev
->null_stream
);
414 free (ptx_dev
->null_stream
);
418 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
419 thread THREAD (and also current device/context). If CREATE is true, create
420 the stream if it does not exist (or use EXISTING if it is non-NULL), and
421 associate the stream with the same thread argument. Returns stream to use
424 static struct ptx_stream
*
425 select_stream_for_async (int async
, pthread_t thread
, bool create
,
428 struct nvptx_thread
*nvthd
= nvptx_thread ();
429 /* Local copy of TLS variable. */
430 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
431 struct ptx_stream
*stream
= NULL
;
432 int orig_async
= async
;
434 /* The special value acc_async_noval (-1) maps (for now) to an
435 implicitly-created stream, which is then handled the same as any other
436 numbered async stream. Other options are available, e.g. using the null
437 stream for anonymous async operations, or choosing an idle stream from an
438 active set. But, stick with this for now. */
439 if (async
> acc_async_sync
)
443 pthread_mutex_lock (&ptx_dev
->stream_lock
);
445 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
446 null stream, and in fact better performance may be obtainable if it doesn't
447 (because the null stream enforces overly-strict synchronisation with
448 respect to other streams for legacy reasons, and that's probably not
449 needed with OpenACC). Maybe investigate later. */
450 if (async
== acc_async_sync
)
451 stream
= ptx_dev
->null_stream
;
452 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
453 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
454 stream
= ptx_dev
->async_streams
.arr
[async
];
455 else if (async
>= 0 && create
)
457 if (async
>= ptx_dev
->async_streams
.size
)
459 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
461 if (async
>= newsize
)
464 ptx_dev
->async_streams
.arr
465 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
466 newsize
* sizeof (struct ptx_stream
*));
468 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
469 ptx_dev
->async_streams
.arr
[i
] = NULL
;
471 ptx_dev
->async_streams
.size
= newsize
;
474 /* Create a new stream on-demand if there isn't one already, or if we're
475 setting a particular async value to an existing (externally-provided)
477 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
481 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
484 s
->stream
= existing
;
487 r
= cuStreamCreate (&s
->stream
, CU_STREAM_DEFAULT
);
488 if (r
!= CUDA_SUCCESS
)
490 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
491 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
496 /* If CREATE is true, we're going to be queueing some work on this
497 stream. Associate it with the current host thread. */
498 s
->host_thread
= thread
;
499 s
->multithreaded
= false;
501 s
->d
= (CUdeviceptr
) NULL
;
505 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
506 GOMP_PLUGIN_fatal ("map_init fail");
509 s
->next
= ptx_dev
->active_streams
;
510 ptx_dev
->active_streams
= s
;
511 ptx_dev
->async_streams
.arr
[async
] = s
;
514 stream
= ptx_dev
->async_streams
.arr
[async
];
519 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
520 GOMP_PLUGIN_fatal ("bad async %d", async
);
525 assert (stream
!= NULL
);
527 /* If we're trying to use the same stream from different threads
528 simultaneously, set stream->multithreaded to true. This affects the
529 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
530 only wait for asynchronous launches from the same host thread they are
531 invoked on. If multiple threads use the same async value, we make note
532 of that here and fall back to testing/waiting for all threads in those
534 if (thread
!= stream
->host_thread
)
535 stream
->multithreaded
= true;
537 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
539 else if (stream
&& !stream
->multithreaded
540 && !pthread_equal (stream
->host_thread
, thread
))
541 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
546 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
547 should be locked on entry and remains locked on exit. */
554 if (instantiated_devices
!= 0)
557 CUDA_CALL (cuInit
, 0);
559 pthread_mutex_init (&ptx_event_lock
, NULL
);
561 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
562 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
567 /* Select the N'th PTX device for the current host thread. The device must
568 have been previously opened before calling this function. */
571 nvptx_attach_host_thread_to_device (int n
)
575 struct ptx_device
*ptx_dev
;
578 r
= cuCtxGetDevice (&dev
);
579 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
581 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
585 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
591 ptx_dev
= ptx_devices
[n
];
594 GOMP_PLUGIN_error ("device %d not found", n
);
598 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
600 /* We don't necessarily have a current context (e.g. if it has been
601 destroyed. Pop it if we do though. */
603 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
605 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
610 static struct ptx_device
*
611 nvptx_open_device (int n
)
613 struct ptx_device
*ptx_dev
;
614 CUdevice dev
, ctx_dev
;
616 int async_engines
, pi
;
618 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
620 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
624 ptx_dev
->ctx_shared
= false;
626 r
= cuCtxGetDevice (&ctx_dev
);
627 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
629 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
633 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
635 /* The current host thread has an active context for a different device.
638 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
641 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
644 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
646 ptx_dev
->ctx_shared
= true;
648 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
649 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
650 ptx_dev
->overlap
= pi
;
652 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
653 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
656 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
657 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
658 ptx_dev
->concur
= pi
;
660 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
661 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
664 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
665 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
668 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
669 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
670 ptx_dev
->clock_khz
= pi
;
672 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
673 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
674 ptx_dev
->num_sms
= pi
;
676 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
677 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
678 ptx_dev
->regs_per_block
= pi
;
680 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
681 in CUDA 6.0 and newer. */
682 r
= cuDeviceGetAttribute (&pi
, 82, dev
);
683 /* Fallback: use limit of registers per block, which is usually equal. */
684 if (r
== CUDA_ERROR_INVALID_VALUE
)
685 pi
= ptx_dev
->regs_per_block
;
686 else if (r
!= CUDA_SUCCESS
)
688 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
691 ptx_dev
->regs_per_sm
= pi
;
693 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
694 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
697 GOMP_PLUGIN_error ("Only warp size 32 is supported");
701 r
= cuDeviceGetAttribute (&async_engines
,
702 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
703 if (r
!= CUDA_SUCCESS
)
706 ptx_dev
->images
= NULL
;
707 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
709 if (!init_streams_for_device (ptx_dev
, async_engines
))
716 nvptx_close_device (struct ptx_device
*ptx_dev
)
721 if (!fini_streams_for_device (ptx_dev
))
724 pthread_mutex_destroy (&ptx_dev
->image_lock
);
726 if (!ptx_dev
->ctx_shared
)
727 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
734 nvptx_get_num_devices (void)
738 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
740 if (sizeof (void *) != 8)
743 /* This function will be called before the plugin has been initialized in
744 order to enumerate available devices, but CUDA API routines can't be used
745 until cuInit has been called. Just call it now (but don't yet do any
746 further initialization). */
747 if (instantiated_devices
== 0)
749 CUresult r
= cuInit (0);
750 /* This is not an error: e.g. we may have CUDA libraries installed but
751 no devices available. */
752 if (r
!= CUDA_SUCCESS
)
756 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
762 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
765 CUjit_option opts
[6];
770 CUlinkState linkstate
;
773 size_t linkoutsize
__attribute__ ((unused
));
775 opts
[0] = CU_JIT_WALL_TIME
;
776 optvals
[0] = &elapsed
;
778 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
779 optvals
[1] = &ilog
[0];
781 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
782 optvals
[2] = (void *) sizeof ilog
;
784 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
785 optvals
[3] = &elog
[0];
787 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
788 optvals
[4] = (void *) sizeof elog
;
790 opts
[5] = CU_JIT_LOG_VERBOSE
;
791 optvals
[5] = (void *) 1;
793 CUDA_CALL (cuLinkCreate
, 6, opts
, optvals
, &linkstate
);
795 for (; num_objs
--; ptx_objs
++)
797 /* cuLinkAddData's 'data' argument erroneously omits the const
799 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
800 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, (char*)ptx_objs
->code
,
801 ptx_objs
->size
, 0, 0, 0, 0);
802 if (r
!= CUDA_SUCCESS
)
804 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
805 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
811 GOMP_PLUGIN_debug (0, "Linking\n");
812 r
= cuLinkComplete (linkstate
, &linkout
, &linkoutsize
);
814 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
815 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
817 if (r
!= CUDA_SUCCESS
)
819 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
823 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
824 CUDA_CALL (cuLinkDestroy
, linkstate
);
829 event_gc (bool memmap_lockable
)
831 struct ptx_event
*ptx_event
= ptx_events
;
832 struct ptx_event
*async_cleanups
= NULL
;
833 struct nvptx_thread
*nvthd
= nvptx_thread ();
835 pthread_mutex_lock (&ptx_event_lock
);
837 while (ptx_event
!= NULL
)
840 struct ptx_event
*e
= ptx_event
;
842 ptx_event
= ptx_event
->next
;
844 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
847 r
= cuEventQuery (*e
->evt
);
848 if (r
== CUDA_SUCCESS
)
850 bool append_async
= false;
865 case PTX_EVT_ASYNC_CLEANUP
:
867 /* The function gomp_plugin_async_unmap_vars needs to claim the
868 memory-map splay tree lock for the current device, so we
869 can't call it when one of our callers has already claimed
870 the lock. In that case, just delay the GC for this event
872 if (!memmap_lockable
)
880 cuEventDestroy (*te
);
883 /* Unlink 'e' from ptx_events list. */
885 ptx_events
= ptx_events
->next
;
888 struct ptx_event
*e_
= ptx_events
;
889 while (e_
->next
!= e
)
891 e_
->next
= e_
->next
->next
;
896 e
->next
= async_cleanups
;
904 pthread_mutex_unlock (&ptx_event_lock
);
906 /* We have to do these here, after ptx_event_lock is released. */
907 while (async_cleanups
)
909 struct ptx_event
*e
= async_cleanups
;
910 async_cleanups
= async_cleanups
->next
;
912 GOMP_PLUGIN_async_unmap_vars (e
->addr
, e
->val
);
918 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
, int val
)
920 struct ptx_event
*ptx_event
;
921 struct nvptx_thread
*nvthd
= nvptx_thread ();
923 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
924 || type
== PTX_EVT_ASYNC_CLEANUP
);
926 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
927 ptx_event
->type
= type
;
930 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
931 ptx_event
->val
= val
;
933 pthread_mutex_lock (&ptx_event_lock
);
935 ptx_event
->next
= ptx_events
;
936 ptx_events
= ptx_event
;
938 pthread_mutex_unlock (&ptx_event_lock
);
942 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
943 int async
, unsigned *dims
, void *targ_mem_desc
)
945 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
949 struct ptx_stream
*dev_str
;
952 struct nvptx_thread
*nvthd
= nvptx_thread ();
953 const char *maybe_abort_msg
= "(perhaps abort was called)";
955 function
= targ_fn
->fn
;
957 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
958 assert (dev_str
== nvthd
->current_stream
);
960 /* Initialize the launch dimensions. Typically this is constant,
961 provided by the device compiler, but we must permit runtime
964 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
966 if (targ_fn
->launch
->dim
[i
])
967 dims
[i
] = targ_fn
->launch
->dim
[i
];
974 /* See if the user provided GOMP_OPENACC_DIM environment
975 variable to specify runtime defaults. */
976 static int default_dims
[GOMP_DIM_MAX
];
978 pthread_mutex_lock (&ptx_dev_lock
);
979 if (!default_dims
[0])
981 /* We only read the environment variable once. You can't
982 change it in the middle of execution. The syntax is
983 the same as for the -fopenacc-dim compilation option. */
984 const char *env_var
= getenv ("GOMP_OPENACC_DIM");
987 const char *pos
= env_var
;
989 for (i
= 0; *pos
&& i
!= GOMP_DIM_MAX
; i
++)
991 if (i
&& *pos
++ != ':')
998 long val
= strtol (pos
, (char **)&eptr
, 10);
999 if (errno
|| val
< 0 || (unsigned)val
!= val
)
1001 default_dims
[i
] = (int)val
;
1007 int warp_size
, block_size
, dev_size
, cpu_size
;
1008 CUdevice dev
= nvptx_thread()->ptx_dev
->dev
;
1009 /* 32 is the default for known hardware. */
1010 int gang
= 0, worker
= 32, vector
= 32;
1011 CUdevice_attribute cu_tpb
, cu_ws
, cu_mpc
, cu_tpm
;
1013 cu_tpb
= CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
;
1014 cu_ws
= CU_DEVICE_ATTRIBUTE_WARP_SIZE
;
1015 cu_mpc
= CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
;
1016 cu_tpm
= CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
;
1018 if (cuDeviceGetAttribute (&block_size
, cu_tpb
, dev
) == CUDA_SUCCESS
1019 && cuDeviceGetAttribute (&warp_size
, cu_ws
, dev
) == CUDA_SUCCESS
1020 && cuDeviceGetAttribute (&dev_size
, cu_mpc
, dev
) == CUDA_SUCCESS
1021 && cuDeviceGetAttribute (&cpu_size
, cu_tpm
, dev
) == CUDA_SUCCESS
)
1023 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1024 " dev_size=%d, cpu_size=%d\n",
1025 warp_size
, block_size
, dev_size
, cpu_size
);
1026 gang
= (cpu_size
/ block_size
) * dev_size
;
1027 worker
= block_size
/ warp_size
;
1031 /* There is no upper bound on the gang size. The best size
1032 matches the hardware configuration. Logical gangs are
1033 scheduled onto physical hardware. To maximize usage, we
1034 should guess a large number. */
1035 if (default_dims
[GOMP_DIM_GANG
] < 1)
1036 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
1037 /* The worker size must not exceed the hardware. */
1038 if (default_dims
[GOMP_DIM_WORKER
] < 1
1039 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
1040 default_dims
[GOMP_DIM_WORKER
] = worker
;
1041 /* The vector size must exactly match the hardware. */
1042 if (default_dims
[GOMP_DIM_VECTOR
] < 1
1043 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
1044 default_dims
[GOMP_DIM_VECTOR
] = vector
;
1046 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1047 default_dims
[GOMP_DIM_GANG
],
1048 default_dims
[GOMP_DIM_WORKER
],
1049 default_dims
[GOMP_DIM_VECTOR
]);
1051 pthread_mutex_unlock (&ptx_dev_lock
);
1053 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1055 dims
[i
] = default_dims
[i
];
1058 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1059 the host and the device. HP is a host pointer to the new chunk, and DP is
1060 the corresponding device pointer. */
1061 map_push (dev_str
, async
, mapnum
* sizeof (void *), &hp
, &dp
);
1063 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1065 /* Copy the array of arguments to the mapped page. */
1066 for (i
= 0; i
< mapnum
; i
++)
1067 ((void **) hp
)[i
] = devaddrs
[i
];
1069 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1070 fact have the same value on a unified-memory system). */
1071 CUDA_CALL_ASSERT (cuMemcpy
, (CUdeviceptr
) dp
, (CUdeviceptr
) hp
,
1072 mapnum
* sizeof (void *));
1073 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1074 " gangs=%u, workers=%u, vectors=%u\n",
1075 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
1076 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
1080 // num_gangs nctaid.x
1081 // num_workers ntid.y
1082 // vector length ntid.x
1085 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
1086 dims
[GOMP_DIM_GANG
], 1, 1,
1087 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
1088 0, dev_str
->stream
, kargs
, 0);
1090 #ifndef DISABLE_ASYNC
1091 if (async
< acc_async_noval
)
1093 r
= cuStreamSynchronize (dev_str
->stream
);
1094 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1095 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1097 else if (r
!= CUDA_SUCCESS
)
1098 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1104 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1106 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1107 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1108 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
1110 else if (r
!= CUDA_SUCCESS
)
1111 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1115 CUDA_CALL_ASSERT (cuEventRecord
, *e
, dev_str
->stream
);
1117 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
, 0);
1120 r
= cuCtxSynchronize ();
1121 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1122 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1124 else if (r
!= CUDA_SUCCESS
)
1125 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1128 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1129 targ_fn
->launch
->fn
);
1131 #ifndef DISABLE_ASYNC
1132 if (async
< acc_async_noval
)
1137 void * openacc_get_current_cuda_context (void);
1140 nvptx_alloc (size_t s
)
1144 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1149 nvptx_free (void *p
)
1154 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1155 if ((CUdeviceptr
) p
!= pb
)
1157 GOMP_PLUGIN_error ("invalid device address");
1161 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1167 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1171 struct nvptx_thread
*nvthd
= nvptx_thread ();
1177 GOMP_PLUGIN_error ("invalid device address");
1181 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1185 GOMP_PLUGIN_error ("invalid device address");
1190 GOMP_PLUGIN_error ("invalid host address");
1195 GOMP_PLUGIN_error ("invalid host or device address");
1198 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1200 GOMP_PLUGIN_error ("invalid size");
1204 #ifndef DISABLE_ASYNC
1205 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1207 CUevent
*e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1208 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1210 CUDA_CALL (cuMemcpyHtoDAsync
,
1211 (CUdeviceptr
) d
, h
, s
, nvthd
->current_stream
->stream
);
1212 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1213 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1217 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) d
, h
, s
);
1223 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1227 struct nvptx_thread
*nvthd
= nvptx_thread ();
1233 GOMP_PLUGIN_error ("invalid device address");
1237 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1241 GOMP_PLUGIN_error ("invalid device address");
1246 GOMP_PLUGIN_error ("invalid host address");
1251 GOMP_PLUGIN_error ("invalid host or device address");
1254 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1256 GOMP_PLUGIN_error ("invalid size");
1260 #ifndef DISABLE_ASYNC
1261 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1263 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1264 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1266 CUDA_CALL (cuMemcpyDtoHAsync
,
1267 h
, (CUdeviceptr
) d
, s
, nvthd
->current_stream
->stream
);
1268 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1269 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1273 CUDA_CALL (cuMemcpyDtoH
, h
, (CUdeviceptr
) d
, s
);
1279 nvptx_set_async (int async
)
1281 struct nvptx_thread
*nvthd
= nvptx_thread ();
1282 nvthd
->current_stream
1283 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1287 nvptx_async_test (int async
)
1290 struct ptx_stream
*s
;
1292 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1295 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1297 r
= cuStreamQuery (s
->stream
);
1298 if (r
== CUDA_SUCCESS
)
1300 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1301 whether all work has completed on this stream, and if so omits the call
1302 to the wait hook. If that happens, event_gc might not get called
1303 (which prevents variables from getting unmapped and their associated
1304 device storage freed), so call it here. */
1308 else if (r
== CUDA_ERROR_NOT_READY
)
1311 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1317 nvptx_async_test_all (void)
1319 struct ptx_stream
*s
;
1320 pthread_t self
= pthread_self ();
1321 struct nvptx_thread
*nvthd
= nvptx_thread ();
1323 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1325 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1327 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1328 && cuStreamQuery (s
->stream
) == CUDA_ERROR_NOT_READY
)
1330 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1335 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1343 nvptx_wait (int async
)
1345 struct ptx_stream
*s
;
1347 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1349 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1351 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1357 nvptx_wait_async (int async1
, int async2
)
1360 struct ptx_stream
*s1
, *s2
;
1361 pthread_t self
= pthread_self ();
1363 /* The stream that is waiting (rather than being waited for) doesn't
1364 necessarily have to exist already. */
1365 s2
= select_stream_for_async (async2
, self
, true, NULL
);
1367 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1369 GOMP_PLUGIN_fatal ("invalid async 1\n");
1372 GOMP_PLUGIN_fatal ("identical parameters");
1374 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1376 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1380 CUDA_CALL_ASSERT (cuEventRecord
, *e
, s1
->stream
);
1382 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1384 CUDA_CALL_ASSERT (cuStreamWaitEvent
, s2
->stream
, *e
, 0);
1388 nvptx_wait_all (void)
1391 struct ptx_stream
*s
;
1392 pthread_t self
= pthread_self ();
1393 struct nvptx_thread
*nvthd
= nvptx_thread ();
1395 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1397 /* Wait for active streams initiated by this thread (or by multiple threads)
1399 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1401 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1403 r
= cuStreamQuery (s
->stream
);
1404 if (r
== CUDA_SUCCESS
)
1406 else if (r
!= CUDA_ERROR_NOT_READY
)
1407 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1409 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1413 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1419 nvptx_wait_all_async (int async
)
1421 struct ptx_stream
*waiting_stream
, *other_stream
;
1423 struct nvptx_thread
*nvthd
= nvptx_thread ();
1424 pthread_t self
= pthread_self ();
1426 /* The stream doing the waiting. This could be the first mention of the
1427 stream, so create it if necessary. */
1429 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1431 /* Launches on the null stream already block on other streams in the
1433 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
1438 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1440 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1441 other_stream
!= NULL
;
1442 other_stream
= other_stream
->next
)
1444 if (!other_stream
->multithreaded
1445 && !pthread_equal (other_stream
->host_thread
, self
))
1448 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1450 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1452 /* Record an event on the waited-for stream. */
1453 CUDA_CALL_ASSERT (cuEventRecord
, *e
, other_stream
->stream
);
1455 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1457 CUDA_CALL_ASSERT (cuStreamWaitEvent
, waiting_stream
->stream
, *e
, 0);
1460 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1464 nvptx_get_current_cuda_device (void)
1466 struct nvptx_thread
*nvthd
= nvptx_thread ();
1468 if (!nvthd
|| !nvthd
->ptx_dev
)
1471 return &nvthd
->ptx_dev
->dev
;
1475 nvptx_get_current_cuda_context (void)
1477 struct nvptx_thread
*nvthd
= nvptx_thread ();
1479 if (!nvthd
|| !nvthd
->ptx_dev
)
1482 return nvthd
->ptx_dev
->ctx
;
1486 nvptx_get_cuda_stream (int async
)
1488 struct ptx_stream
*s
;
1489 struct nvptx_thread
*nvthd
= nvptx_thread ();
1491 if (!nvthd
|| !nvthd
->ptx_dev
)
1494 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1496 return s
? s
->stream
: NULL
;
1500 nvptx_set_cuda_stream (int async
, void *stream
)
1502 struct ptx_stream
*oldstream
;
1503 pthread_t self
= pthread_self ();
1504 struct nvptx_thread
*nvthd
= nvptx_thread ();
1507 GOMP_PLUGIN_fatal ("bad async %d", async
);
1509 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1511 /* We have a list of active streams and an array mapping async values to
1512 entries of that list. We need to take "ownership" of the passed-in stream,
1513 and add it to our list, removing the previous entry also (if there was one)
1514 in order to prevent resource leaks. Note the potential for surprise
1515 here: maybe we should keep track of passed-in streams and leave it up to
1516 the user to tidy those up, but that doesn't work for stream handles
1517 returned from acc_get_cuda_stream above... */
1519 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
1523 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1524 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
1527 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1528 while (s
->next
!= oldstream
)
1530 s
->next
= s
->next
->next
;
1533 CUDA_CALL_ASSERT (cuStreamDestroy
, oldstream
->stream
);
1535 if (!map_fini (oldstream
))
1536 GOMP_PLUGIN_fatal ("error when freeing host memory");
1541 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1543 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1548 /* Plugin entry points. */
/* Return the identifying name of this offload plugin.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}
1557 GOMP_OFFLOAD_get_caps (void)
1559 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1563 GOMP_OFFLOAD_get_type (void)
1565 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
/* Report how many usable NVPTX devices are present.  */

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
1575 GOMP_OFFLOAD_init_device (int n
)
1577 struct ptx_device
*dev
;
1579 pthread_mutex_lock (&ptx_dev_lock
);
1581 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1583 pthread_mutex_unlock (&ptx_dev_lock
);
1587 dev
= nvptx_open_device (n
);
1590 ptx_devices
[n
] = dev
;
1591 instantiated_devices
++;
1594 pthread_mutex_unlock (&ptx_dev_lock
);
1600 GOMP_OFFLOAD_fini_device (int n
)
1602 pthread_mutex_lock (&ptx_dev_lock
);
1604 if (ptx_devices
[n
] != NULL
)
1606 if (!nvptx_attach_host_thread_to_device (n
)
1607 || !nvptx_close_device (ptx_devices
[n
]))
1609 pthread_mutex_unlock (&ptx_dev_lock
);
1612 ptx_devices
[n
] = NULL
;
1613 instantiated_devices
--;
1616 pthread_mutex_unlock (&ptx_dev_lock
);
1620 /* Return the libgomp version number we're compatible with. There is
1621 no requirement for cross-version compatibility. */
1624 GOMP_OFFLOAD_version (void)
1626 return GOMP_VERSION
;
1629 /* Initialize __nvptx_clocktick, if present in MODULE. */
1632 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
1635 CUresult r
= cuModuleGetGlobal (&dptr
, NULL
, module
, "__nvptx_clocktick");
1636 if (r
== CUDA_ERROR_NOT_FOUND
)
1638 if (r
!= CUDA_SUCCESS
)
1639 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1640 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1641 r
= cuMemcpyHtoD (dptr
, &__nvptx_clocktick
, sizeof (__nvptx_clocktick
));
1642 if (r
!= CUDA_SUCCESS
)
1643 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1646 /* Load the (partial) program described by TARGET_DATA to device
1647 number ORD. Allocate and return TARGET_TABLE. */
1650 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1651 struct addr_pair
**target_table
)
1654 const char *const *var_names
;
1655 const struct targ_fn_launch
*fn_descs
;
1656 unsigned int fn_entries
, var_entries
, i
, j
;
1657 struct targ_fn_descriptor
*targ_fns
;
1658 struct addr_pair
*targ_tbl
;
1659 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1660 struct ptx_image_data
*new_image
;
1661 struct ptx_device
*dev
;
1663 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1665 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1666 " (expected %u, received %u)",
1667 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1671 if (!nvptx_attach_host_thread_to_device (ord
)
1672 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1675 dev
= ptx_devices
[ord
];
1677 /* The mkoffload utility emits a struct of pointers/integers at the
1678 start of each offload image. The array of kernel names and the
1679 functions addresses form a one-to-one correspondence. */
1681 var_entries
= img_header
->var_num
;
1682 var_names
= img_header
->var_names
;
1683 fn_entries
= img_header
->fn_num
;
1684 fn_descs
= img_header
->fn_descs
;
1686 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1687 * (fn_entries
+ var_entries
));
1688 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1691 *target_table
= targ_tbl
;
1693 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1694 new_image
->target_data
= target_data
;
1695 new_image
->module
= module
;
1696 new_image
->fns
= targ_fns
;
1698 pthread_mutex_lock (&dev
->image_lock
);
1699 new_image
->next
= dev
->images
;
1700 dev
->images
= new_image
;
1701 pthread_mutex_unlock (&dev
->image_lock
);
1703 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1705 CUfunction function
;
1708 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1710 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1711 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1712 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1713 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1715 targ_fns
->fn
= function
;
1716 targ_fns
->launch
= &fn_descs
[i
];
1717 targ_fns
->regs_per_thread
= nregs
;
1718 targ_fns
->max_threads_per_block
= mthrs
;
1720 targ_tbl
->start
= (uintptr_t) targ_fns
;
1721 targ_tbl
->end
= targ_tbl
->start
+ 1;
1724 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1729 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1730 &var
, &bytes
, module
, var_names
[j
]);
1732 targ_tbl
->start
= (uintptr_t) var
;
1733 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1736 nvptx_set_clocktick (module
, dev
);
1738 return fn_entries
+ var_entries
;
1741 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1742 function descriptors allocated by G_O_load_image. */
1745 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1747 struct ptx_image_data
*image
, **prev_p
;
1748 struct ptx_device
*dev
= ptx_devices
[ord
];
1750 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1752 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1753 " (expected %u, received %u)",
1754 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1759 pthread_mutex_lock (&dev
->image_lock
);
1760 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1761 if (image
->target_data
== target_data
)
1763 *prev_p
= image
->next
;
1764 if (cuModuleUnload (image
->module
) != CUDA_SUCCESS
)
1770 pthread_mutex_unlock (&dev
->image_lock
);
/* Allocate SIZE bytes of device memory on device ORD; NULL on failure.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;
  return nvptx_alloc (size);
}
/* Release device memory PTR on device ORD.  */

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr));
}
/* Copy N bytes from device address SRC on device ORD to host address DST.  */

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
}
/* Copy N bytes from host address SRC to device address DST on device ORD.  */

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_host2dev (dst, src, n));
}
1804 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
1806 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1807 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
,
1808 ptx_dev
->null_stream
->stream
);
/* Hook for running code on the device; initialized to NULL.
   NOTE(review): no assignment to this pointer is visible in this chunk —
   confirm whether anything sets it before use elsewhere in the file.  */
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
/* Launch an OpenACC parallel region: delegate directly to nvptx_exec.  */

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs,
			       int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}
1823 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
, int async
)
1825 struct nvptx_thread
*nvthd
= nvptx_thread ();
1826 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1828 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1829 CUDA_CALL_ASSERT (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1830 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
, async
);
/* Non-blocking completion test for the stream mapped to ASYNC.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}
/* Non-blocking completion test across all of this thread's streams.  */

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}
/* Block until the stream mapped to ASYNC has drained.  */

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}
/* Make stream ASYNC2 wait (device-side) for stream ASYNC1.  */

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}
/* Block until all of this thread's streams have drained.  */

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}
/* Make stream ASYNC wait (device-side) for all other streams.  */

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}
/* Switch the calling thread's current stream to the one mapped to ASYNC.  */

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
1876 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1878 struct ptx_device
*ptx_dev
;
1879 struct nvptx_thread
*nvthd
1880 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1883 ptx_dev
= ptx_devices
[ord
];
1887 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
1889 assert (ptx_dev
->ctx
);
1892 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
1894 nvthd
->current_stream
= ptx_dev
->null_stream
;
1895 nvthd
->ptx_dev
= ptx_dev
;
1897 return (void *) nvthd
;
/* Release the per-thread state created by
   GOMP_OFFLOAD_openacc_create_thread_data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* Expose the current CUDA device handle to the OpenACC runtime.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* Expose the current CUDA context to the OpenACC runtime.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}
1918 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
/* NOTE: this returns the raw CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}
1926 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
/* NOTE: this takes a raw CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}
1934 /* Adjust launch dimensions: pick good values for number of blocks and warps
1935 and ensure that number of warps does not exceed CUDA limits as well as GCC's
1939 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
1940 struct ptx_device
*ptx_dev
,
1941 int *teams_p
, int *threads_p
)
1943 int max_warps_block
= fn
->max_threads_per_block
/ 32;
1944 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
1945 and libgcc, which matches documented limit of all GPUs as of 2015. */
1946 if (max_warps_block
> 32)
1947 max_warps_block
= 32;
1948 if (*threads_p
<= 0)
1950 if (*threads_p
> max_warps_block
)
1951 *threads_p
= max_warps_block
;
1953 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
1954 /* This is an estimate of how many blocks the device can host simultaneously.
1955 Actual limit, which may be lower, can be queried with "occupancy control"
1956 driver interface (since CUDA 6.0). */
1957 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
1958 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
1959 *teams_p
= max_blocks
;
1962 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
/* Per-warp soft-stack size (see gcc -msoft-stack) used for OpenMP
   target regions: 128 KiB.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
1971 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
1974 nvptx_stacks_alloc (size_t size
, int num
)
1977 CUresult r
= cuMemAlloc (&stacks
, size
* num
);
1978 if (r
!= CUDA_SUCCESS
)
1979 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
1980 return (void *) stacks
;
1983 /* Release storage previously allocated by nvptx_stacks_alloc. */
1986 nvptx_stacks_free (void *p
, int num
)
1988 CUresult r
= cuMemFree ((CUdeviceptr
) p
);
1989 if (r
!= CUDA_SUCCESS
)
1990 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1994 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
1996 CUfunction function
= ((struct targ_fn_descriptor
*) tgt_fn
)->fn
;
1998 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1999 const char *maybe_abort_msg
= "(perhaps abort was called)";
2000 int teams
= 0, threads
= 0;
2003 GOMP_PLUGIN_fatal ("No target arguments provided");
2006 intptr_t id
= (intptr_t) *args
++, val
;
2007 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
2008 val
= (intptr_t) *args
++;
2010 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
2011 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
2013 val
= val
> INT_MAX
? INT_MAX
: val
;
2014 id
&= GOMP_TARGET_ARG_ID_MASK
;
2015 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
2017 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
2020 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
2022 size_t stack_size
= nvptx_stacks_size ();
2023 void *stacks
= nvptx_stacks_alloc (stack_size
, teams
* threads
);
2024 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
2025 size_t fn_args_size
= sizeof fn_args
;
2027 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
2028 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
2031 r
= cuLaunchKernel (function
,
2034 0, ptx_dev
->null_stream
->stream
, NULL
, config
);
2035 if (r
!= CUDA_SUCCESS
)
2036 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
2038 r
= cuCtxSynchronize ();
2039 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2040 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2042 else if (r
!= CUDA_SUCCESS
)
2043 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2044 nvptx_stacks_free (stacks
, teams
* threads
);
2048 GOMP_OFFLOAD_async_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
,
2051 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");