1 /* Plugin for AMD GCN execution.
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* {{{ Includes and defines */
41 #include <hsa_ext_amd.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
48 #include "oacc-plugin.h"
52 /* These probably won't be in elf.h for a while. */
54 #define R_AMDGPU_NONE 0
55 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57 #define R_AMDGPU_ABS64 3 /* S + A */
58 #define R_AMDGPU_REL32 4 /* S + A - P */
59 #define R_AMDGPU_REL64 5 /* S + A - P */
60 #define R_AMDGPU_ABS32 6 /* S + A */
61 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
66 #define R_AMDGPU_RELATIVE64 13 /* B + A */
69 /* GCN specific definitions for asynchronous queues. */
71 #define ASYNC_QUEUE_SIZE 64
72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
73 #define DEBUG_QUEUES 0
74 #define DEBUG_THREAD_SLEEP 0
75 #define DEBUG_THREAD_SIGNAL 0
78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
80 /* Secure getenv() which returns NULL if running as SUID/SGID. */
81 #ifndef HAVE_SECURE_GETENV
82 #ifdef HAVE___SECURE_GETENV
83 #define secure_getenv __secure_getenv
84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
89 /* Implementation of secure_getenv() for targets where it is not provided but
90 we have at least means to test real and effective IDs. */
93 secure_getenv (const char *name
)
95 if ((getuid () == geteuid ()) && (getgid () == getegid ()))
102 #define secure_getenv getenv
109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
113 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
117 /* As an HSA runtime is dlopened, following structure defines function
118 pointers utilized by the HSA plug-in. */
120 struct hsa_runtime_fn_info
123 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
124 const char **status_string
);
125 hsa_status_t (*hsa_system_get_info_fn
) (hsa_system_info_t attribute
,
127 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
128 hsa_agent_info_t attribute
,
130 hsa_status_t (*hsa_isa_get_info_fn
)(hsa_isa_t isa
,
131 hsa_isa_info_t attribute
,
134 hsa_status_t (*hsa_init_fn
) (void);
135 hsa_status_t (*hsa_iterate_agents_fn
)
136 (hsa_status_t (*callback
)(hsa_agent_t agent
, void *data
), void *data
);
137 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
138 hsa_region_info_t attribute
,
140 hsa_status_t (*hsa_queue_create_fn
)
141 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
142 void (*callback
)(hsa_status_t status
, hsa_queue_t
*source
, void *data
),
143 void *data
, uint32_t private_segment_size
,
144 uint32_t group_segment_size
, hsa_queue_t
**queue
);
145 hsa_status_t (*hsa_agent_iterate_regions_fn
)
147 hsa_status_t (*callback
)(hsa_region_t region
, void *data
), void *data
);
148 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
149 hsa_status_t (*hsa_executable_create_fn
)
150 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
151 const char *options
, hsa_executable_t
*executable
);
152 hsa_status_t (*hsa_executable_global_variable_define_fn
)
153 (hsa_executable_t executable
, const char *variable_name
, void *address
);
154 hsa_status_t (*hsa_executable_load_code_object_fn
)
155 (hsa_executable_t executable
, hsa_agent_t agent
,
156 hsa_code_object_t code_object
, const char *options
);
157 hsa_status_t (*hsa_executable_freeze_fn
)(hsa_executable_t executable
,
158 const char *options
);
159 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
160 uint32_t num_consumers
,
161 const hsa_agent_t
*consumers
,
162 hsa_signal_t
*signal
);
163 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
165 hsa_status_t (*hsa_memory_assign_agent_fn
) (void *ptr
, hsa_agent_t agent
,
166 hsa_access_permission_t access
);
167 hsa_status_t (*hsa_memory_copy_fn
)(void *dst
, const void *src
, size_t size
);
168 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
169 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
170 hsa_status_t (*hsa_executable_get_symbol_fn
)
171 (hsa_executable_t executable
, const char *module_name
,
172 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
173 hsa_executable_symbol_t
*symbol
);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
175 (hsa_executable_symbol_t executable_symbol
,
176 hsa_executable_symbol_info_t attribute
, void *value
);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn
)
178 (hsa_executable_t executable
,
179 hsa_status_t (*callback
)(hsa_executable_t executable
,
180 hsa_executable_symbol_t symbol
, void *data
),
182 uint64_t (*hsa_queue_add_write_index_release_fn
) (const hsa_queue_t
*queue
,
184 uint64_t (*hsa_queue_load_read_index_acquire_fn
) (const hsa_queue_t
*queue
);
185 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
186 hsa_signal_value_t value
);
187 void (*hsa_signal_store_release_fn
) (hsa_signal_t signal
,
188 hsa_signal_value_t value
);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
190 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
191 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
192 hsa_wait_state_t wait_state_hint
);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn
) (hsa_signal_t signal
);
194 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
196 hsa_status_t (*hsa_code_object_deserialize_fn
)
197 (void *serialized_code_object
, size_t serialized_code_object_size
,
198 const char *options
, hsa_code_object_t
*code_object
);
199 hsa_status_t (*hsa_amd_memory_lock_fn
)
200 (void *host_ptr
, size_t size
, hsa_agent_t
*agents
, int num_agent
,
202 hsa_status_t (*hsa_amd_memory_unlock_fn
) (void *host_ptr
);
203 hsa_status_t (*hsa_amd_memory_async_copy_rect_fn
)
204 (const hsa_pitched_ptr_t
*dst
, const hsa_dim3_t
*dst_offset
,
205 const hsa_pitched_ptr_t
*src
, const hsa_dim3_t
*src_offset
,
206 const hsa_dim3_t
*range
, hsa_agent_t copy_agent
,
207 hsa_amd_copy_direction_t dir
, uint32_t num_dep_signals
,
208 const hsa_signal_t
*dep_signals
, hsa_signal_t completion_signal
);
211 /* Structure describing the run-time and grid properties of an HSA kernel
212 lauch. This needs to match the format passed to GOMP_OFFLOAD_run. */
214 struct GOMP_kernel_launch_attributes
216 /* Number of dimensions the workload has. Maximum number is 3. */
218 /* Size of the grid in the three respective dimensions. */
220 /* Size of work-groups in the respective dimensions. */
224 /* Collection of information needed for a dispatch of a kernel from a
227 struct kernel_dispatch
229 struct agent_info
*agent
;
230 /* Pointer to a command queue associated with a kernel dispatch agent. */
232 /* Pointer to a memory space used for kernel arguments passing. */
233 void *kernarg_address
;
236 /* Synchronization signal used for dispatch synchronization. */
238 /* Private segment size. */
239 uint32_t private_segment_size
;
240 /* Group segment size. */
241 uint32_t group_segment_size
;
244 /* Structure of the kernargs segment, supporting console output.
246 This needs to match the definitions in Newlib, and the expectations
247 in libgomp target code. */
250 struct kernargs_abi abi
;
253 struct output output_data
;
256 /* A queue entry for a future asynchronous launch. */
260 struct kernel_info
*kernel
;
262 struct GOMP_kernel_launch_attributes kla
;
265 /* A queue entry for a future callback. */
273 /* A data struct for the copy_data callback. */
280 struct goacc_asyncqueue
*aq
;
283 /* A queue entry for a placeholder. These correspond to a wait event. */
289 pthread_mutex_t mutex
;
292 /* A queue entry for a wait directive. */
294 struct asyncwait_info
296 struct placeholder
*placeholderp
;
299 /* Encode the type of an entry in an async queue. */
309 /* An entry in an async queue. */
313 enum entry_type type
;
315 struct kernel_launch launch
;
316 struct callback callback
;
317 struct asyncwait_info asyncwait
;
318 struct placeholder placeholder
;
322 /* An async queue header.
324 OpenMP may create one of these.
325 OpenACC may create many. */
327 struct goacc_asyncqueue
329 struct agent_info
*agent
;
330 hsa_queue_t
*hsa_queue
;
332 pthread_t thread_drain_queue
;
333 pthread_mutex_t mutex
;
334 pthread_cond_t queue_cond_in
;
335 pthread_cond_t queue_cond_out
;
336 struct queue_entry queue
[ASYNC_QUEUE_SIZE
];
339 int drain_queue_stop
;
342 struct goacc_asyncqueue
*prev
;
343 struct goacc_asyncqueue
*next
;
346 /* Mkoffload uses this structure to describe a kernel.
348 OpenMP kernel dimensions are passed at runtime.
349 OpenACC kernel dimensions are passed at compile time, here. */
351 struct hsa_kernel_description
354 int oacc_dims
[3]; /* Only present for GCN kernels. */
359 /* Mkoffload uses this structure to describe an offload variable. */
361 struct global_var_info
367 /* Mkoffload uses this structure to describe all the kernels in a
368 loadable module. These are passed the libgomp via static constructors. */
370 struct gcn_image_desc
376 const unsigned kernel_count
;
377 struct hsa_kernel_description
*kernel_infos
;
378 const unsigned ind_func_count
;
379 const unsigned global_variable_count
;
382 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
384 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
387 EF_AMDGPU_MACH_UNSUPPORTED
= -1,
388 EF_AMDGPU_MACH_AMDGCN_GFX803
= 0x02a,
389 EF_AMDGPU_MACH_AMDGCN_GFX900
= 0x02c,
390 EF_AMDGPU_MACH_AMDGCN_GFX906
= 0x02f,
391 EF_AMDGPU_MACH_AMDGCN_GFX908
= 0x030,
392 EF_AMDGPU_MACH_AMDGCN_GFX90a
= 0x03f,
393 EF_AMDGPU_MACH_AMDGCN_GFX1030
= 0x036,
394 EF_AMDGPU_MACH_AMDGCN_GFX1100
= 0x041
397 const static int EF_AMDGPU_MACH_MASK
= 0x000000ff;
398 typedef EF_AMDGPU_MACH gcn_isa
;
400 /* Description of an HSA GPU agent (device) and the program associated with
405 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
407 /* The user-visible device number. */
409 /* Whether the agent has been initialized. The fields below are usable only
413 /* The instruction set architecture of the device. */
415 /* Name of the agent. */
417 /* Name of the vendor of the agent. */
418 char vendor_name
[64];
419 /* Command queues of the agent. */
420 hsa_queue_t
*sync_queue
;
421 struct goacc_asyncqueue
*async_queues
, *omp_async_queue
;
422 pthread_mutex_t async_queues_mutex
;
424 /* The HSA memory region from which to allocate kernel arguments. */
425 hsa_region_t kernarg_region
;
427 /* The HSA memory region from which to allocate device data. */
428 hsa_region_t data_region
;
430 /* Allocated ephemeral memories (team arena and stack space). */
431 struct ephemeral_memories_list
*ephemeral_memories_list
;
432 pthread_mutex_t ephemeral_memories_write_lock
;
434 /* Read-write lock that protects kernels which are running or about to be run
435 from interference with loading and unloading of images. Needs to be
436 locked for reading while a kernel is being run, and for writing if the
437 list of modules is manipulated (and thus the HSA program invalidated). */
438 pthread_rwlock_t module_rwlock
;
440 /* The module associated with this kernel. */
441 struct module_info
*module
;
443 /* Mutex enforcing that only one thread will finalize the HSA program. A
444 thread should have locked agent->module_rwlock for reading before
446 pthread_mutex_t prog_mutex
;
447 /* Flag whether the HSA program that consists of all the modules has been
450 /* HSA executable - the finalized program that is used to locate kernels. */
451 hsa_executable_t executable
;
454 /* Information required to identify, finalize and run any given kernel. */
456 enum offload_kind
{KIND_UNKNOWN
, KIND_OPENMP
, KIND_OPENACC
};
460 /* Name of the kernel, required to locate it within the GCN object-code
463 /* The specific agent the kernel has been or will be finalized for and run
465 struct agent_info
*agent
;
466 /* The specific module where the kernel takes place. */
467 struct module_info
*module
;
468 /* Information provided by mkoffload associated with the kernel. */
469 struct hsa_kernel_description
*description
;
470 /* Mutex enforcing that at most once thread ever initializes a kernel for
471 use. A thread should have locked agent->module_rwlock for reading before
473 pthread_mutex_t init_mutex
;
474 /* Flag indicating whether the kernel has been initialized and all fields
475 below it contain valid data. */
477 /* Flag indicating that the kernel has a problem that blocks an execution. */
478 bool initialization_failed
;
479 /* The object to be put into the dispatch queue. */
481 /* Required size of kernel arguments. */
482 uint32_t kernarg_segment_size
;
483 /* Required size of group segment. */
484 uint32_t group_segment_size
;
485 /* Required size of private segment. */
486 uint32_t private_segment_size
;
487 /* Set up for OpenMP or OpenACC? */
488 enum offload_kind kind
;
491 /* Information about a particular GCN module, its image and kernels. */
495 /* The description with which the program has registered the image. */
496 struct gcn_image_desc
*image_desc
;
497 /* GCN heap allocation. */
499 /* Physical boundaries of the loaded module. */
500 Elf64_Addr phys_address_start
;
501 Elf64_Addr phys_address_end
;
503 bool constructors_run_p
;
504 struct kernel_info
*init_array_func
, *fini_array_func
;
506 /* Number of kernels in this module. */
508 /* An array of kernel_info structures describing each kernel in this
510 struct kernel_info kernels
[];
513 /* A linked list of memory arenas allocated on the device.
514 These are used by OpenMP, as a means to optimize per-team malloc,
515 and for host-accessible stack space. */
517 struct ephemeral_memories_list
519 struct ephemeral_memories_list
*next
;
521 /* The size is determined by the number of teams and threads. */
523 /* The device address allocated memory. */
525 /* A flag to prevent two asynchronous kernels trying to use the same memory.
526 The mutex is locked until the kernel exits. */
527 pthread_mutex_t in_use
;
530 /* Information about the whole HSA environment and all of its agents. */
532 struct hsa_context_info
534 /* Whether the structure has been initialized. */
536 /* Number of usable GPU HSA agents in the system. */
538 /* Array of agent_info structures describing the individual HSA agents. */
539 struct agent_info
*agents
;
540 /* Driver version string. */
541 char driver_version_s
[30];
545 /* {{{ Global variables */
547 /* Information about the whole HSA environment and all of its agents. */
549 static struct hsa_context_info hsa_context
;
551 /* HSA runtime functions that are initialized in init_hsa_context. */
553 static struct hsa_runtime_fn_info hsa_fns
;
555 /* Heap space, allocated target-side, provided for use of newlib malloc.
556 Each module should have it's own heap allocated.
557 Beware that heap usage increases with OpenMP teams. See also arenas. */
559 static size_t gcn_kernel_heap_size
= DEFAULT_GCN_HEAP_SIZE
;
561 /* Ephemeral memory sizes for each kernel launch. */
563 static int team_arena_size
= DEFAULT_TEAM_ARENA_SIZE
;
564 static int stack_size
= DEFAULT_GCN_STACK_SIZE
;
565 static int lowlat_size
= -1;
567 /* Flag to decide whether print to stderr information about what is going on.
568 Set in init_debug depending on environment variables. */
572 /* Flag to decide if the runtime should suppress a possible fallback to host
575 static bool suppress_host_fallback
;
577 /* Flag to locate HSA runtime shared library that is dlopened
580 static const char *hsa_runtime_lib
;
582 /* Flag to decide if the runtime should support also CPU devices (can be
585 static bool support_cpu_devices
;
587 /* Runtime dimension overrides. Zero indicates default. */
589 static int override_x_dim
= 0;
590 static int override_z_dim
= 0;
593 /* {{{ Debug & Diagnostic */
595 /* Print a message to stderr if GCN_DEBUG value is set to true. */
597 #define DEBUG_PRINT(...) \
602 fprintf (stderr, __VA_ARGS__); \
607 /* Flush stderr if GCN_DEBUG value is set to true. */
609 #define DEBUG_FLUSH() \
615 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
618 #define DEBUG_LOG(prefix, ...) \
621 DEBUG_PRINT (prefix); \
622 DEBUG_PRINT (__VA_ARGS__); \
626 /* Print a debugging message to stderr. */
628 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
630 /* Print a warning message to stderr. */
632 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
634 /* Print HSA warning STR with an HSA STATUS code. */
637 hsa_warn (const char *str
, hsa_status_t status
)
642 const char *hsa_error_msg
= "[unknown]";
643 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
645 fprintf (stderr
, "GCN warning: %s\nRuntime message: %s\n", str
,
649 /* Report a fatal error STR together with the HSA error corresponding to STATUS
650 and terminate execution of the current process. */
653 hsa_fatal (const char *str
, hsa_status_t status
)
655 const char *hsa_error_msg
= "[unknown]";
656 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
657 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str
,
661 /* Like hsa_fatal, except only report error message, and return FALSE
662 for propagating error processing to outside of plugin. */
665 hsa_error (const char *str
, hsa_status_t status
)
667 const char *hsa_error_msg
= "[unknown]";
668 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
669 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str
,
674 /* Dump information about the available hardware. */
677 dump_hsa_system_info (void)
681 hsa_endianness_t endianness
;
682 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS
,
684 if (status
== HSA_STATUS_SUCCESS
)
687 case HSA_ENDIANNESS_LITTLE
:
688 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
690 case HSA_ENDIANNESS_BIG
:
691 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
694 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
697 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
699 uint8_t extensions
[128];
700 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS
,
702 if (status
== HSA_STATUS_SUCCESS
)
704 if (extensions
[0] & (1 << HSA_EXTENSION_IMAGES
))
705 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
708 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
711 /* Dump information about the available hardware. */
714 dump_machine_model (hsa_machine_model_t machine_model
, const char *s
)
716 switch (machine_model
)
718 case HSA_MACHINE_MODEL_SMALL
:
719 GCN_DEBUG ("%s: SMALL\n", s
);
721 case HSA_MACHINE_MODEL_LARGE
:
722 GCN_DEBUG ("%s: LARGE\n", s
);
725 GCN_WARNING ("%s: UNKNOWN\n", s
);
730 /* Dump information about the available hardware. */
733 dump_profile (hsa_profile_t profile
, const char *s
)
737 case HSA_PROFILE_FULL
:
738 GCN_DEBUG ("%s: FULL\n", s
);
740 case HSA_PROFILE_BASE
:
741 GCN_DEBUG ("%s: BASE\n", s
);
744 GCN_WARNING ("%s: UNKNOWN\n", s
);
749 /* Dump information about a device memory region. */
752 dump_hsa_region (hsa_region_t region
, void *data
__attribute__((unused
)))
756 hsa_region_segment_t segment
;
757 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
759 if (status
== HSA_STATUS_SUCCESS
)
761 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
762 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
763 else if (segment
== HSA_REGION_SEGMENT_READONLY
)
764 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
765 else if (segment
== HSA_REGION_SEGMENT_PRIVATE
)
766 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
767 else if (segment
== HSA_REGION_SEGMENT_GROUP
)
768 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
770 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
773 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
775 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
779 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
781 if (status
== HSA_STATUS_SUCCESS
)
783 if (flags
& HSA_REGION_GLOBAL_FLAG_KERNARG
)
784 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
785 if (flags
& HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
)
786 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
787 if (flags
& HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
)
788 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
791 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
795 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SIZE
, &size
);
796 if (status
== HSA_STATUS_SUCCESS
)
797 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size
);
799 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
802 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_ALLOC_MAX_SIZE
,
804 if (status
== HSA_STATUS_SUCCESS
)
805 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size
);
807 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
811 = hsa_fns
.hsa_region_get_info_fn (region
,
812 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED
,
814 if (status
== HSA_STATUS_SUCCESS
)
815 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed
);
817 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
819 if (status
!= HSA_STATUS_SUCCESS
|| !alloc_allowed
)
820 return HSA_STATUS_SUCCESS
;
823 = hsa_fns
.hsa_region_get_info_fn (region
,
824 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
,
826 if (status
== HSA_STATUS_SUCCESS
)
827 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size
);
829 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
833 = hsa_fns
.hsa_region_get_info_fn (region
,
834 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT
,
836 if (status
== HSA_STATUS_SUCCESS
)
837 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align
);
839 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
841 return HSA_STATUS_SUCCESS
;
844 /* Dump information about all the device memory regions. */
847 dump_hsa_regions (hsa_agent_t agent
)
850 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
,
853 if (status
!= HSA_STATUS_SUCCESS
)
854 hsa_error ("Dumping hsa regions failed", status
);
857 /* Dump information about the available devices. */
860 dump_hsa_agent_info (hsa_agent_t agent
, void *data
__attribute__((unused
)))
865 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_NAME
,
867 if (status
== HSA_STATUS_SUCCESS
)
868 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf
);
870 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
872 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_VENDOR_NAME
,
874 if (status
== HSA_STATUS_SUCCESS
)
875 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf
);
877 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
879 hsa_machine_model_t machine_model
;
881 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_MACHINE_MODEL
,
883 if (status
== HSA_STATUS_SUCCESS
)
884 dump_machine_model (machine_model
, "HSA_AGENT_INFO_MACHINE_MODEL");
886 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
888 hsa_profile_t profile
;
889 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_PROFILE
,
891 if (status
== HSA_STATUS_SUCCESS
)
892 dump_profile (profile
, "HSA_AGENT_INFO_PROFILE");
894 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
896 hsa_device_type_t device_type
;
897 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
899 if (status
== HSA_STATUS_SUCCESS
)
903 case HSA_DEVICE_TYPE_CPU
:
904 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
906 case HSA_DEVICE_TYPE_GPU
:
907 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
909 case HSA_DEVICE_TYPE_DSP
:
910 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
913 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
918 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
921 status
= hsa_fns
.hsa_agent_get_info_fn
922 (agent
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
923 if (status
== HSA_STATUS_SUCCESS
)
924 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count
);
926 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
929 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_WAVEFRONT_SIZE
,
931 if (status
== HSA_STATUS_SUCCESS
)
932 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size
);
934 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
937 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
938 HSA_AGENT_INFO_WORKGROUP_MAX_DIM
,
940 if (status
== HSA_STATUS_SUCCESS
)
941 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim
);
943 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
946 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
947 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE
,
949 if (status
== HSA_STATUS_SUCCESS
)
950 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size
);
952 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
954 uint32_t grid_max_dim
;
955 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_DIM
,
957 if (status
== HSA_STATUS_SUCCESS
)
958 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim
);
960 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
962 uint32_t grid_max_size
;
963 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_SIZE
,
965 if (status
== HSA_STATUS_SUCCESS
)
966 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size
);
968 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
970 dump_hsa_regions (agent
);
972 return HSA_STATUS_SUCCESS
;
975 /* Forward reference. */
977 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol
);
979 /* Helper function for dump_executable_symbols. */
982 dump_executable_symbol (hsa_executable_t executable
,
983 hsa_executable_symbol_t symbol
,
984 void *data
__attribute__((unused
)))
986 char *name
= get_executable_symbol_name (symbol
);
990 GCN_DEBUG ("executable symbol: %s\n", name
);
994 return HSA_STATUS_SUCCESS
;
997 /* Dump all global symbol in an executable. */
1000 dump_executable_symbols (hsa_executable_t executable
)
1002 hsa_status_t status
;
1004 = hsa_fns
.hsa_executable_iterate_symbols_fn (executable
,
1005 dump_executable_symbol
,
1007 if (status
!= HSA_STATUS_SUCCESS
)
1008 hsa_fatal ("Could not dump HSA executable symbols", status
);
1011 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1014 print_kernel_dispatch (struct kernel_dispatch
*dispatch
, unsigned indent
)
1016 struct kernargs
*kernargs
= (struct kernargs
*)dispatch
->kernarg_address
;
1018 fprintf (stderr
, "%*sthis: %p\n", indent
, "", dispatch
);
1019 fprintf (stderr
, "%*squeue: %p\n", indent
, "", dispatch
->queue
);
1020 fprintf (stderr
, "%*skernarg_address: %p\n", indent
, "", kernargs
);
1021 fprintf (stderr
, "%*sheap address: %p\n", indent
, "",
1022 (void*)kernargs
->abi
.heap_ptr
);
1023 fprintf (stderr
, "%*sarena address: %p (%d bytes per workgroup)\n", indent
,
1024 "", (void*)kernargs
->abi
.arena_ptr
,
1025 kernargs
->abi
.arena_size_per_team
);
1026 fprintf (stderr
, "%*sstack address: %p (%d bytes per wavefront)\n", indent
,
1027 "", (void*)kernargs
->abi
.stack_ptr
,
1028 kernargs
->abi
.stack_size_per_thread
);
1029 fprintf (stderr
, "%*sobject: %lu\n", indent
, "", dispatch
->object
);
1030 fprintf (stderr
, "%*sprivate_segment_size: %u\n", indent
, "",
1031 dispatch
->private_segment_size
);
1032 fprintf (stderr
, "%*sgroup_segment_size: %u (low-latency pool)\n", indent
,
1033 "", dispatch
->group_segment_size
);
1034 fprintf (stderr
, "\n");
1038 /* {{{ Utility functions */
1040 /* Cast the thread local storage to gcn_thread. */
1042 static inline struct gcn_thread
*
1045 return (struct gcn_thread
*) GOMP_PLUGIN_acc_thread ();
1048 /* Initialize debug and suppress_host_fallback according to the environment. */
1051 init_environment_variables (void)
1053 if (secure_getenv ("GCN_DEBUG"))
1058 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1059 suppress_host_fallback
= true;
1061 suppress_host_fallback
= false;
1063 hsa_runtime_lib
= secure_getenv ("HSA_RUNTIME_LIB");
1064 if (hsa_runtime_lib
== NULL
)
1065 hsa_runtime_lib
= "libhsa-runtime64.so.1";
1067 support_cpu_devices
= secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1069 const char *x
= secure_getenv ("GCN_NUM_TEAMS");
1071 x
= secure_getenv ("GCN_NUM_GANGS");
1073 override_x_dim
= atoi (x
);
1075 const char *z
= secure_getenv ("GCN_NUM_THREADS");
1077 z
= secure_getenv ("GCN_NUM_WORKERS");
1079 override_z_dim
= atoi (z
);
1081 const char *heap
= secure_getenv ("GCN_HEAP_SIZE");
1084 size_t tmp
= atol (heap
);
1086 gcn_kernel_heap_size
= tmp
;
1089 const char *arena
= secure_getenv ("GCN_TEAM_ARENA_SIZE");
1092 int tmp
= atoi (arena
);
1094 team_arena_size
= tmp
;;
1097 const char *stack
= secure_getenv ("GCN_STACK_SIZE");
1100 int tmp
= atoi (stack
);
1105 const char *lowlat
= secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1107 lowlat_size
= atoi (lowlat
);
1110 /* Return malloc'd string with name of SYMBOL. */
1113 get_executable_symbol_name (hsa_executable_symbol_t symbol
)
1115 hsa_status_t status
;
1118 const hsa_executable_symbol_info_t info_name_length
1119 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
;
1121 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name_length
,
1123 if (status
!= HSA_STATUS_SUCCESS
)
1125 hsa_error ("Could not get length of symbol name", status
);
1129 res
= GOMP_PLUGIN_malloc (len
+ 1);
1131 const hsa_executable_symbol_info_t info_name
1132 = HSA_EXECUTABLE_SYMBOL_INFO_NAME
;
1134 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name
, res
);
1136 if (status
!= HSA_STATUS_SUCCESS
)
1138 hsa_error ("Could not get symbol name", status
);
1148 /* Get the number of GPU Compute Units. */
1151 get_cu_count (struct agent_info
*agent
)
1154 hsa_status_t status
= hsa_fns
.hsa_agent_get_info_fn
1155 (agent
->id
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
1156 if (status
== HSA_STATUS_SUCCESS
)
1159 return 64; /* The usual number for older devices. */
1162 /* Calculate the maximum grid size for OMP threads / OACC workers.
1163 This depends on the kernel's resource usage levels. */
1166 limit_worker_threads (int threads
)
1168 /* FIXME Do something more inteligent here.
1169 GCN can always run 4 threads within a Compute Unit, but
1170 more than that depends on register usage. */
1176 /* This sets the maximum number of teams to twice the number of GPU Compute
1177 Units to avoid memory waste and corresponding memory access faults. */
1180 limit_teams (int teams
, struct agent_info
*agent
)
1182 int max_teams
= 2 * get_cu_count (agent
);
1183 if (teams
> max_teams
)
1188 /* Parse the target attributes INPUT provided by the compiler and return true
1189 if we should run anything all. If INPUT is NULL, fill DEF with default
1190 values, then store INPUT or DEF into *RESULT.
1192 This is used for OpenMP only. */
1195 parse_target_attributes (void **input
,
1196 struct GOMP_kernel_launch_attributes
*def
,
1197 struct GOMP_kernel_launch_attributes
**result
,
1198 struct agent_info
*agent
)
1201 GOMP_PLUGIN_fatal ("No target arguments provided");
1203 bool grid_attrs_found
= false;
1204 bool gcn_dims_found
= false;
1206 int gcn_threads
= 0;
1209 intptr_t id
= (intptr_t) *input
++, val
;
1211 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
1212 val
= (intptr_t) *input
++;
1214 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
1216 val
= (val
> INT_MAX
) ? INT_MAX
: val
;
1218 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) == GOMP_DEVICE_GCN
1219 && ((id
& GOMP_TARGET_ARG_ID_MASK
)
1220 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES
))
1222 grid_attrs_found
= true;
1225 else if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
)
1226 == GOMP_TARGET_ARG_DEVICE_ALL
)
1228 gcn_dims_found
= true;
1229 switch (id
& GOMP_TARGET_ARG_ID_MASK
)
1231 case GOMP_TARGET_ARG_NUM_TEAMS
:
1232 gcn_teams
= limit_teams (val
, agent
);
1234 case GOMP_TARGET_ARG_THREAD_LIMIT
:
1235 gcn_threads
= limit_worker_threads (val
);
1245 bool gfx900_workaround_p
= false;
1247 if (agent
->device_isa
== EF_AMDGPU_MACH_AMDGCN_GFX900
1248 && gcn_threads
== 0 && override_z_dim
== 0)
1250 gfx900_workaround_p
= true;
1251 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1252 "threads to at most 4 per team.\n");
1253 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1254 "GCN_NUM_THREADS=16\n");
1257 /* Ideally, when a dimension isn't explicitly specified, we should
1258 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1259 In practice, we tune for peak performance on BabelStream, which
1260 for OpenACC is currently 32 threads per CU. */
1262 if (gcn_teams
<= 0 && gcn_threads
<= 0)
1264 /* Set up a reasonable number of teams and threads. */
1265 gcn_threads
= gfx900_workaround_p
? 4 : 16; // 8;
1266 def
->gdims
[0] = get_cu_count (agent
); // * (40 / gcn_threads);
1267 def
->gdims
[2] = gcn_threads
;
1269 else if (gcn_teams
<= 0 && gcn_threads
> 0)
1271 /* Auto-scale the number of teams with the number of threads. */
1272 def
->gdims
[0] = get_cu_count (agent
); // * (40 / gcn_threads);
1273 def
->gdims
[2] = gcn_threads
;
1275 else if (gcn_teams
> 0 && gcn_threads
<= 0)
1277 int max_threads
= gfx900_workaround_p
? 4 : 16;
1279 /* Auto-scale the number of threads with the number of teams. */
1280 def
->gdims
[0] = gcn_teams
;
1281 def
->gdims
[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1282 if (def
->gdims
[2] == 0)
1284 else if (def
->gdims
[2] > max_threads
)
1285 def
->gdims
[2] = max_threads
;
1289 def
->gdims
[0] = gcn_teams
;
1290 def
->gdims
[2] = gcn_threads
;
1292 def
->gdims
[1] = 64; /* Each thread is 64 work items wide. */
1293 def
->wdims
[0] = 1; /* Single team per work-group. */
1299 else if (!grid_attrs_found
)
1309 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1313 struct GOMP_kernel_launch_attributes
*kla
;
1314 kla
= (struct GOMP_kernel_launch_attributes
*) *input
;
1316 if (kla
->ndim
== 0 || kla
->ndim
> 3)
1317 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla
->ndim
);
1319 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla
->ndim
);
1321 for (i
= 0; i
< kla
->ndim
; i
++)
1323 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i
,
1324 kla
->gdims
[i
], kla
->wdims
[i
]);
1325 if (kla
->gdims
[i
] == 0)
1331 /* Return the group size given the requested GROUP size, GRID size and number
1332 of grid dimensions NDIM. */
1335 get_group_size (uint32_t ndim
, uint32_t grid
, uint32_t group
)
1339 /* TODO: Provide a default via environment or device characteristics. */
1353 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1356 packet_store_release (uint32_t* packet
, uint16_t header
, uint16_t rest
)
1358 __atomic_store_n (packet
, header
| (rest
<< 16), __ATOMIC_RELEASE
);
1361 /* A never-called callback for the HSA command queues. These signal events
1362 that we don't use, so we trigger an error.
1364 This "queue" is not to be confused with the async queues, below. */
1367 hsa_queue_callback (hsa_status_t status
,
1368 hsa_queue_t
*queue
__attribute__ ((unused
)),
1369 void *data
__attribute__ ((unused
)))
1371 hsa_fatal ("Asynchronous queue error", status
);
1375 /* {{{ HSA initialization */
1377 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1380 init_hsa_runtime_functions (void)
1382 #define DLSYM_FN(function) \
1383 hsa_fns.function##_fn = dlsym (handle, #function); \
1384 if (hsa_fns.function##_fn == NULL) \
1386 #define DLSYM_OPT_FN(function) \
1387 hsa_fns.function##_fn = dlsym (handle, #function);
1388 void *handle
= dlopen (hsa_runtime_lib
, RTLD_LAZY
);
1392 DLSYM_FN (hsa_status_string
)
1393 DLSYM_FN (hsa_system_get_info
)
1394 DLSYM_FN (hsa_agent_get_info
)
1396 DLSYM_FN (hsa_iterate_agents
)
1397 DLSYM_FN (hsa_region_get_info
)
1398 DLSYM_FN (hsa_queue_create
)
1399 DLSYM_FN (hsa_agent_iterate_regions
)
1400 DLSYM_FN (hsa_executable_destroy
)
1401 DLSYM_FN (hsa_executable_create
)
1402 DLSYM_FN (hsa_executable_global_variable_define
)
1403 DLSYM_FN (hsa_executable_load_code_object
)
1404 DLSYM_FN (hsa_executable_freeze
)
1405 DLSYM_FN (hsa_signal_create
)
1406 DLSYM_FN (hsa_memory_allocate
)
1407 DLSYM_FN (hsa_memory_assign_agent
)
1408 DLSYM_FN (hsa_memory_copy
)
1409 DLSYM_FN (hsa_memory_free
)
1410 DLSYM_FN (hsa_signal_destroy
)
1411 DLSYM_FN (hsa_executable_get_symbol
)
1412 DLSYM_FN (hsa_executable_symbol_get_info
)
1413 DLSYM_FN (hsa_executable_iterate_symbols
)
1414 DLSYM_FN (hsa_queue_add_write_index_release
)
1415 DLSYM_FN (hsa_queue_load_read_index_acquire
)
1416 DLSYM_FN (hsa_signal_wait_acquire
)
1417 DLSYM_FN (hsa_signal_store_relaxed
)
1418 DLSYM_FN (hsa_signal_store_release
)
1419 DLSYM_FN (hsa_signal_load_acquire
)
1420 DLSYM_FN (hsa_queue_destroy
)
1421 DLSYM_FN (hsa_code_object_deserialize
)
1422 DLSYM_OPT_FN (hsa_amd_memory_lock
)
1423 DLSYM_OPT_FN (hsa_amd_memory_unlock
)
1424 DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect
)
1430 static gcn_isa
isa_code (const char *isa
);
1432 /* Return true if the agent is a GPU and can accept of concurrent submissions
1433 from different threads. */
1436 suitable_hsa_agent_p (hsa_agent_t agent
)
1438 hsa_device_type_t device_type
;
1440 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
1442 if (status
!= HSA_STATUS_SUCCESS
)
1445 switch (device_type
)
1447 case HSA_DEVICE_TYPE_GPU
:
1451 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_NAME
, name
);
1452 if (status
!= HSA_STATUS_SUCCESS
1453 || isa_code (name
) == EF_AMDGPU_MACH_UNSUPPORTED
)
1455 GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
1456 status
== HSA_STATUS_SUCCESS
? name
: "invalid");
1461 case HSA_DEVICE_TYPE_CPU
:
1462 if (!support_cpu_devices
)
1469 uint32_t features
= 0;
1470 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_FEATURE
,
1472 if (status
!= HSA_STATUS_SUCCESS
1473 || !(features
& HSA_AGENT_FEATURE_KERNEL_DISPATCH
))
1475 hsa_queue_type_t queue_type
;
1476 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_QUEUE_TYPE
,
1478 if (status
!= HSA_STATUS_SUCCESS
1479 || (queue_type
!= HSA_QUEUE_TYPE_MULTI
))
1485 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1486 agent_count in hsa_context. */
1489 count_gpu_agents (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
1491 if (suitable_hsa_agent_p (agent
))
1492 hsa_context
.agent_count
++;
1493 return HSA_STATUS_SUCCESS
;
1496 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1497 id to the describing structure in the hsa context. The index of the
1498 structure is pointed to by DATA, increment it afterwards. */
1501 assign_agent_ids (hsa_agent_t agent
, void *data
)
1503 if (suitable_hsa_agent_p (agent
))
1505 int *agent_index
= (int *) data
;
1506 hsa_context
.agents
[*agent_index
].id
= agent
;
1509 return HSA_STATUS_SUCCESS
;
1512 /* Initialize hsa_context if it has not already been done.
1513 Return TRUE on success. */
1516 init_hsa_context (void)
1518 hsa_status_t status
;
1519 int agent_index
= 0;
1521 if (hsa_context
.initialized
)
1523 init_environment_variables ();
1524 if (!init_hsa_runtime_functions ())
1526 GCN_WARNING ("Run-time could not be dynamically opened\n");
1527 if (suppress_host_fallback
)
1528 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1531 status
= hsa_fns
.hsa_init_fn ();
1532 if (status
!= HSA_STATUS_SUCCESS
)
1533 return hsa_error ("Run-time could not be initialized", status
);
1534 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1537 dump_hsa_system_info ();
1539 status
= hsa_fns
.hsa_iterate_agents_fn (count_gpu_agents
, NULL
);
1540 if (status
!= HSA_STATUS_SUCCESS
)
1541 return hsa_error ("GCN GPU devices could not be enumerated", status
);
1542 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context
.agent_count
);
1545 = GOMP_PLUGIN_malloc_cleared (hsa_context
.agent_count
1546 * sizeof (struct agent_info
));
1547 status
= hsa_fns
.hsa_iterate_agents_fn (assign_agent_ids
, &agent_index
);
1548 if (status
!= HSA_STATUS_SUCCESS
)
1549 return hsa_error ("Scanning compute agents failed", status
);
1550 if (agent_index
!= hsa_context
.agent_count
)
1552 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1558 status
= hsa_fns
.hsa_iterate_agents_fn (dump_hsa_agent_info
, NULL
);
1559 if (status
!= HSA_STATUS_SUCCESS
)
1560 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1563 uint16_t minor
, major
;
1564 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR
,
1566 if (status
!= HSA_STATUS_SUCCESS
)
1567 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1568 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR
,
1570 if (status
!= HSA_STATUS_SUCCESS
)
1571 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1573 size_t len
= sizeof hsa_context
.driver_version_s
;
1574 int printed
= snprintf (hsa_context
.driver_version_s
, len
,
1575 "HSA Runtime %hu.%hu", (unsigned short int)major
,
1576 (unsigned short int)minor
);
1578 GCN_WARNING ("HSA runtime version string was truncated."
1579 "Version %hu.%hu is too long.", (unsigned short int)major
,
1580 (unsigned short int)minor
);
1582 hsa_context
.initialized
= true;
1586 /* Verify that hsa_context has already been initialized and return the
1587 agent_info structure describing device number N. Return NULL on error. */
1589 static struct agent_info
*
1590 get_agent_info (int n
)
1592 if (!hsa_context
.initialized
)
1594 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1597 if (n
>= hsa_context
.agent_count
)
1599 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n
);
1602 if (!hsa_context
.agents
[n
].initialized
)
1604 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1607 return &hsa_context
.agents
[n
];
1610 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1612 Selects (breaks at) a suitable region of type KIND. */
1615 get_memory_region (hsa_region_t region
, hsa_region_t
*retval
,
1616 hsa_region_global_flag_t kind
)
1618 hsa_status_t status
;
1619 hsa_region_segment_t segment
;
1621 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
1623 if (status
!= HSA_STATUS_SUCCESS
)
1625 if (segment
!= HSA_REGION_SEGMENT_GLOBAL
)
1626 return HSA_STATUS_SUCCESS
;
1629 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
1631 if (status
!= HSA_STATUS_SUCCESS
)
1636 return HSA_STATUS_INFO_BREAK
;
1638 return HSA_STATUS_SUCCESS
;
1641 /* Callback of hsa_agent_iterate_regions.
1643 Selects a kernargs memory region. */
1646 get_kernarg_memory_region (hsa_region_t region
, void *data
)
1648 return get_memory_region (region
, (hsa_region_t
*)data
,
1649 HSA_REGION_GLOBAL_FLAG_KERNARG
);
1652 /* Callback of hsa_agent_iterate_regions.
1654 Selects a coarse-grained memory region suitable for the heap and
1658 get_data_memory_region (hsa_region_t region
, void *data
)
1660 return get_memory_region (region
, (hsa_region_t
*)data
,
1661 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
);
1665 elf_gcn_isa_field (Elf64_Ehdr
*image
)
1667 return image
->e_flags
& EF_AMDGPU_MACH_MASK
;
1670 const static char *gcn_gfx803_s
= "gfx803";
1671 const static char *gcn_gfx900_s
= "gfx900";
1672 const static char *gcn_gfx906_s
= "gfx906";
1673 const static char *gcn_gfx908_s
= "gfx908";
1674 const static char *gcn_gfx90a_s
= "gfx90a";
1675 const static char *gcn_gfx1030_s
= "gfx1030";
1676 const static char *gcn_gfx1100_s
= "gfx1100";
1677 const static int gcn_isa_name_len
= 7;
1679 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1683 isa_hsa_name (int isa
) {
1686 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1687 return gcn_gfx803_s
;
1688 case EF_AMDGPU_MACH_AMDGCN_GFX900
:
1689 return gcn_gfx900_s
;
1690 case EF_AMDGPU_MACH_AMDGCN_GFX906
:
1691 return gcn_gfx906_s
;
1692 case EF_AMDGPU_MACH_AMDGCN_GFX908
:
1693 return gcn_gfx908_s
;
1694 case EF_AMDGPU_MACH_AMDGCN_GFX90a
:
1695 return gcn_gfx90a_s
;
1696 case EF_AMDGPU_MACH_AMDGCN_GFX1030
:
1697 return gcn_gfx1030_s
;
1698 case EF_AMDGPU_MACH_AMDGCN_GFX1100
:
1699 return gcn_gfx1100_s
;
1704 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1705 with -march) or NULL if we do not support the ISA.
1706 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1709 isa_gcc_name (int isa
) {
1712 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1715 return isa_hsa_name (isa
);
1719 /* Returns the code which is used in the GCN object code to identify the ISA with
1720 the given name (as used by the HSA runtime). */
1723 isa_code(const char *isa
) {
1724 if (!strncmp (isa
, gcn_gfx803_s
, gcn_isa_name_len
))
1725 return EF_AMDGPU_MACH_AMDGCN_GFX803
;
1727 if (!strncmp (isa
, gcn_gfx900_s
, gcn_isa_name_len
))
1728 return EF_AMDGPU_MACH_AMDGCN_GFX900
;
1730 if (!strncmp (isa
, gcn_gfx906_s
, gcn_isa_name_len
))
1731 return EF_AMDGPU_MACH_AMDGCN_GFX906
;
1733 if (!strncmp (isa
, gcn_gfx908_s
, gcn_isa_name_len
))
1734 return EF_AMDGPU_MACH_AMDGCN_GFX908
;
1736 if (!strncmp (isa
, gcn_gfx90a_s
, gcn_isa_name_len
))
1737 return EF_AMDGPU_MACH_AMDGCN_GFX90a
;
1739 if (!strncmp (isa
, gcn_gfx1030_s
, gcn_isa_name_len
))
1740 return EF_AMDGPU_MACH_AMDGCN_GFX1030
;
1742 if (!strncmp (isa
, gcn_gfx1100_s
, gcn_isa_name_len
))
1743 return EF_AMDGPU_MACH_AMDGCN_GFX1100
;
1745 return EF_AMDGPU_MACH_UNSUPPORTED
;
1748 /* CDNA2 devices have twice as many VGPRs compared to older devices. */
1751 max_isa_vgprs (int isa
)
1755 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1756 case EF_AMDGPU_MACH_AMDGCN_GFX900
:
1757 case EF_AMDGPU_MACH_AMDGCN_GFX906
:
1758 case EF_AMDGPU_MACH_AMDGCN_GFX908
:
1760 case EF_AMDGPU_MACH_AMDGCN_GFX90a
:
1762 case EF_AMDGPU_MACH_AMDGCN_GFX1030
:
1763 return 512; /* 512 SIMD32 = 256 wavefrontsize64. */
1764 case EF_AMDGPU_MACH_AMDGCN_GFX1100
:
1765 return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */
1767 GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1773 /* Create or reuse a team arena and stack space.
1775 Team arenas are used by OpenMP to avoid calling malloc multiple times
1776 while setting up each team. This is purely a performance optimization.
1778 The stack space is used by all kernels. We must allocate it in such a
1779 way that the reverse offload implmentation can access the data.
1781 Allocating this memory costs performance, so this function will reuse an
1782 existing allocation if a large enough one is idle.
1783 The memory lock is released, but not deallocated, when the kernel exits. */
1786 configure_ephemeral_memories (struct kernel_info
*kernel
,
1787 struct kernargs_abi
*kernargs
, int num_teams
,
1790 struct agent_info
*agent
= kernel
->agent
;
1791 struct ephemeral_memories_list
**next_ptr
= &agent
->ephemeral_memories_list
;
1792 struct ephemeral_memories_list
*item
;
1794 int actual_arena_size
= (kernel
->kind
== KIND_OPENMP
1795 ? team_arena_size
: 0);
1796 int actual_arena_total_size
= actual_arena_size
* num_teams
;
1797 size_t size
= (actual_arena_total_size
1798 + num_teams
* num_threads
* stack_size
);
1800 for (item
= *next_ptr
; item
; next_ptr
= &item
->next
, item
= item
->next
)
1802 if (item
->size
< size
)
1805 if (pthread_mutex_trylock (&item
->in_use
) == 0)
1811 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1812 " (%zd bytes)\n", (actual_arena_size
? "arena and " : ""),
1813 num_teams
, num_threads
, size
);
1815 if (pthread_mutex_lock (&agent
->ephemeral_memories_write_lock
))
1817 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1820 item
= malloc (sizeof (*item
));
1825 if (pthread_mutex_init (&item
->in_use
, NULL
))
1827 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1830 if (pthread_mutex_lock (&item
->in_use
))
1832 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1835 if (pthread_mutex_unlock (&agent
->ephemeral_memories_write_lock
))
1837 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1841 hsa_status_t status
;
1842 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
, size
,
1844 if (status
!= HSA_STATUS_SUCCESS
)
1845 hsa_fatal ("Could not allocate memory for GCN kernel arena", status
);
1846 status
= hsa_fns
.hsa_memory_assign_agent_fn (item
->address
, agent
->id
,
1847 HSA_ACCESS_PERMISSION_RW
);
1848 if (status
!= HSA_STATUS_SUCCESS
)
1849 hsa_fatal ("Could not assign arena & stack memory to device", status
);
1852 kernargs
->arena_ptr
= (actual_arena_total_size
1853 ? (uint64_t)item
->address
1855 kernargs
->stack_ptr
= (uint64_t)item
->address
+ actual_arena_total_size
;
1856 kernargs
->arena_size_per_team
= actual_arena_size
;
1857 kernargs
->stack_size_per_thread
= stack_size
;
1860 /* Mark an ephemeral memory space available for reuse. */
1863 release_ephemeral_memories (struct agent_info
* agent
, void *address
)
1865 struct ephemeral_memories_list
*item
;
1867 for (item
= agent
->ephemeral_memories_list
; item
; item
= item
->next
)
1869 if (item
->address
== address
)
1871 if (pthread_mutex_unlock (&item
->in_use
))
1872 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1876 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1879 /* Clean up all the allocated team arenas. */
1882 destroy_ephemeral_memories (struct agent_info
*agent
)
1884 struct ephemeral_memories_list
*item
, *next
;
1886 for (item
= agent
->ephemeral_memories_list
; item
; item
= next
)
1889 hsa_fns
.hsa_memory_free_fn (item
->address
);
1890 if (pthread_mutex_destroy (&item
->in_use
))
1892 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1897 agent
->ephemeral_memories_list
= NULL
;
1902 /* Allocate memory on a specified device. */
1905 alloc_by_agent (struct agent_info
*agent
, size_t size
)
1907 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size
, agent
->device_id
);
1910 hsa_status_t status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
1912 if (status
!= HSA_STATUS_SUCCESS
)
1914 hsa_error ("Could not allocate device memory", status
);
1918 status
= hsa_fns
.hsa_memory_assign_agent_fn (ptr
, agent
->id
,
1919 HSA_ACCESS_PERMISSION_RW
);
1920 if (status
!= HSA_STATUS_SUCCESS
)
1922 hsa_error ("Could not assign data memory to device", status
);
1926 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1927 bool profiling_dispatch_p
1928 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1929 if (profiling_dispatch_p
)
1931 acc_prof_info
*prof_info
= thr
->prof_info
;
1932 acc_event_info data_event_info
;
1933 acc_api_info
*api_info
= thr
->api_info
;
1935 prof_info
->event_type
= acc_ev_alloc
;
1937 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1938 data_event_info
.data_event
.valid_bytes
1939 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1940 data_event_info
.data_event
.parent_construct
1941 = acc_construct_parallel
;
1942 data_event_info
.data_event
.implicit
= 1;
1943 data_event_info
.data_event
.tool_info
= NULL
;
1944 data_event_info
.data_event
.var_name
= NULL
;
1945 data_event_info
.data_event
.bytes
= size
;
1946 data_event_info
.data_event
.host_ptr
= NULL
;
1947 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
1949 api_info
->device_api
= acc_device_api_other
;
1951 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1958 /* Create kernel dispatch data structure for given KERNEL, along with
1959 the necessary device signals and memory allocations. */
1961 static struct kernel_dispatch
*
1962 create_kernel_dispatch (struct kernel_info
*kernel
, int num_teams
,
1965 struct agent_info
*agent
= kernel
->agent
;
1966 struct kernel_dispatch
*shadow
1967 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch
));
1969 shadow
->agent
= kernel
->agent
;
1970 shadow
->object
= kernel
->object
;
1972 hsa_signal_t sync_signal
;
1973 hsa_status_t status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &sync_signal
);
1974 if (status
!= HSA_STATUS_SUCCESS
)
1975 hsa_fatal ("Error creating the GCN sync signal", status
);
1977 shadow
->signal
= sync_signal
.handle
;
1978 shadow
->private_segment_size
= kernel
->private_segment_size
;
1980 if (lowlat_size
< 0)
1982 /* Divide the LDS between the number of running teams.
1983 Allocate not less than is defined in the kernel metadata. */
1984 int teams_per_cu
= num_teams
/ get_cu_count (agent
);
1985 int LDS_per_team
= (teams_per_cu
? 65536 / teams_per_cu
: 65536);
1986 shadow
->group_segment_size
1987 = (kernel
->group_segment_size
> LDS_per_team
1988 ? kernel
->group_segment_size
1991 else if (lowlat_size
< GCN_LOWLAT_HEAP
+8)
1992 /* Ensure that there's space for the OpenMP libgomp data. */
1993 shadow
->group_segment_size
= GCN_LOWLAT_HEAP
+8;
1995 shadow
->group_segment_size
= (lowlat_size
> 65536
1999 /* We expect kernels to request a single pointer, explicitly, and the
2000 rest of struct kernargs, implicitly. If they request anything else
2001 then something is wrong. */
2002 if (kernel
->kernarg_segment_size
> 8)
2004 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
2008 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->kernarg_region
,
2009 sizeof (struct kernargs
),
2010 &shadow
->kernarg_address
);
2011 if (status
!= HSA_STATUS_SUCCESS
)
2012 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status
);
2013 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2015 /* Zero-initialize the output_data (minimum needed). */
2016 kernargs
->abi
.out_ptr
= (int64_t)&kernargs
->output_data
;
2017 kernargs
->output_data
.next_output
= 0;
2018 for (unsigned i
= 0;
2019 i
< (sizeof (kernargs
->output_data
.queue
)
2020 / sizeof (kernargs
->output_data
.queue
[0]));
2022 kernargs
->output_data
.queue
[i
].written
= 0;
2023 kernargs
->output_data
.consumed
= 0;
2025 /* Pass in the heap location. */
2026 kernargs
->abi
.heap_ptr
= (int64_t)kernel
->module
->heap
;
2028 /* Create the ephemeral memory spaces. */
2029 configure_ephemeral_memories (kernel
, &kernargs
->abi
, num_teams
, num_threads
);
2031 /* Ensure we can recognize unset return values. */
2032 kernargs
->output_data
.return_value
= 0xcafe0000;
2038 process_reverse_offload (uint64_t fn
, uint64_t mapnum
, uint64_t hostaddrs
,
2039 uint64_t sizes
, uint64_t kinds
, uint64_t dev_num64
)
2041 int dev_num
= dev_num64
;
2042 GOMP_PLUGIN_target_rev (fn
, mapnum
, hostaddrs
, sizes
, kinds
, dev_num
,
2046 /* Output any data written to console output from the kernel. It is expected
2047 that this function is polled during kernel execution.
2049 We print all entries from the last item printed to the next entry without
2050 a "written" flag. If the "final" flag is set then it'll continue right to
2053 The print buffer is circular, but the from and to locations don't wrap when
2054 the buffer does, so the output limit is UINT_MAX. The target blocks on
2055 output when the buffer is full. */
2058 console_output (struct kernel_info
*kernel
, struct kernargs
*kernargs
,
2061 unsigned int limit
= (sizeof (kernargs
->output_data
.queue
)
2062 / sizeof (kernargs
->output_data
.queue
[0]));
2064 unsigned int from
= __atomic_load_n (&kernargs
->output_data
.consumed
,
2066 unsigned int to
= kernargs
->output_data
.next_output
;
2072 printf ("GCN print buffer overflowed.\n");
2077 for (i
= from
; i
< to
; i
++)
2079 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
%limit
];
2081 if (!data
->written
&& !final
)
2086 case 0: printf ("%.128s%ld\n", data
->msg
, data
->ivalue
); break;
2087 case 1: printf ("%.128s%f\n", data
->msg
, data
->dvalue
); break;
2088 case 2: printf ("%.128s%.128s\n", data
->msg
, data
->text
); break;
2089 case 3: printf ("%.128s%.128s", data
->msg
, data
->text
); break;
2091 process_reverse_offload (data
->value_u64
[0], data
->value_u64
[1],
2092 data
->value_u64
[2], data
->value_u64
[3],
2093 data
->value_u64
[4], data
->value_u64
[5]);
2095 default: printf ("GCN print buffer error!\n"); break;
2098 __atomic_store_n (&kernargs
->output_data
.consumed
, i
+1,
2104 /* Release data structure created for a kernel dispatch in SHADOW argument,
2105 and clean up the signal and memory allocations. */
2108 release_kernel_dispatch (struct kernel_dispatch
*shadow
)
2110 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow
);
2112 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2113 void *addr
= (void *)kernargs
->abi
.arena_ptr
;
2115 addr
= (void *)kernargs
->abi
.stack_ptr
;
2116 release_ephemeral_memories (shadow
->agent
, addr
);
2118 hsa_fns
.hsa_memory_free_fn (shadow
->kernarg_address
);
2121 s
.handle
= shadow
->signal
;
2122 hsa_fns
.hsa_signal_destroy_fn (s
);
2127 /* Extract the properties from a kernel binary. */
2130 init_kernel_properties (struct kernel_info
*kernel
)
2132 hsa_status_t status
;
2133 struct agent_info
*agent
= kernel
->agent
;
2134 hsa_executable_symbol_t kernel_symbol
;
2135 char *buf
= alloca (strlen (kernel
->name
) + 4);
2136 sprintf (buf
, "%s.kd", kernel
->name
);
2137 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
2140 if (status
!= HSA_STATUS_SUCCESS
)
2142 hsa_warn ("Could not find symbol for kernel in the code object", status
);
2143 fprintf (stderr
, "not found name: '%s'\n", buf
);
2144 dump_executable_symbols (agent
->executable
);
2147 GCN_DEBUG ("Located kernel %s\n", kernel
->name
);
2148 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2149 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &kernel
->object
);
2150 if (status
!= HSA_STATUS_SUCCESS
)
2151 hsa_fatal ("Could not extract a kernel object from its symbol", status
);
2152 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2153 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
2154 &kernel
->kernarg_segment_size
);
2155 if (status
!= HSA_STATUS_SUCCESS
)
2156 hsa_fatal ("Could not get info about kernel argument size", status
);
2157 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2158 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
2159 &kernel
->group_segment_size
);
2160 if (status
!= HSA_STATUS_SUCCESS
)
2161 hsa_fatal ("Could not get info about kernel group segment size", status
);
2162 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2163 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
2164 &kernel
->private_segment_size
);
2165 if (status
!= HSA_STATUS_SUCCESS
)
2166 hsa_fatal ("Could not get info about kernel private segment size",
2169 /* The kernel type is not known until something tries to launch it. */
2170 kernel
->kind
= KIND_UNKNOWN
;
2172 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2173 "following segment sizes: \n", kernel
->name
);
2174 GCN_DEBUG (" group_segment_size: %u\n",
2175 (unsigned) kernel
->group_segment_size
);
2176 GCN_DEBUG (" private_segment_size: %u\n",
2177 (unsigned) kernel
->private_segment_size
);
2178 GCN_DEBUG (" kernarg_segment_size: %u\n",
2179 (unsigned) kernel
->kernarg_segment_size
);
2183 kernel
->initialization_failed
= true;
2186 /* Do all the work that is necessary before running KERNEL for the first time.
2187 The function assumes the program has been created, finalized and frozen by
2188 create_and_finalize_hsa_program. */
2191 init_kernel (struct kernel_info
*kernel
)
2193 if (pthread_mutex_lock (&kernel
->init_mutex
))
2194 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2195 if (kernel
->initialized
)
2197 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2198 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2204 init_kernel_properties (kernel
);
2206 if (!kernel
->initialization_failed
)
2210 kernel
->initialized
= true;
2212 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2213 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2217 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2218 launch attributes from KLA.
2220 MODULE_LOCKED indicates that the caller already holds the lock and
2221 run_kernel need not lock it again.
2222 If AQ is NULL then agent->sync_queue will be used. */
2225 run_kernel (struct kernel_info
*kernel
, void *vars
,
2226 struct GOMP_kernel_launch_attributes
*kla
,
2227 struct goacc_asyncqueue
*aq
, bool module_locked
)
2229 struct agent_info
*agent
= kernel
->agent
;
2230 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel
->description
->sgpr_count
,
2231 kernel
->description
->vpgr_count
);
2233 /* Reduce the number of threads/workers if there are insufficient
2234 VGPRs available to run the kernels together. */
2235 if (kla
->ndim
== 3 && kernel
->description
->vpgr_count
> 0)
2237 int max_vgprs
= max_isa_vgprs (agent
->device_isa
);
2238 int granulated_vgprs
= (kernel
->description
->vpgr_count
+ 3) & ~3;
2239 int max_threads
= (max_vgprs
/ granulated_vgprs
) * 4;
2240 if (kla
->gdims
[2] > max_threads
)
2242 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2243 " per team/gang - reducing to %d threads/workers.\n",
2244 kla
->gdims
[2], max_threads
);
2245 kla
->gdims
[2] = max_threads
;
2249 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel
->agent
->device_id
,
2251 GCN_DEBUG ("GCN launch attribs: gdims:[");
2253 for (i
= 0; i
< kla
->ndim
; ++i
)
2257 DEBUG_PRINT ("%u", kla
->gdims
[i
]);
2259 DEBUG_PRINT ("], normalized gdims:[");
2260 for (i
= 0; i
< kla
->ndim
; ++i
)
2264 DEBUG_PRINT ("%u", kla
->gdims
[i
] / kla
->wdims
[i
]);
2266 DEBUG_PRINT ("], wdims:[");
2267 for (i
= 0; i
< kla
->ndim
; ++i
)
2271 DEBUG_PRINT ("%u", kla
->wdims
[i
]);
2273 DEBUG_PRINT ("]\n");
2276 if (!module_locked
&& pthread_rwlock_rdlock (&agent
->module_rwlock
))
2277 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2279 if (!agent
->initialized
)
2280 GOMP_PLUGIN_fatal ("Agent must be initialized");
2282 if (!kernel
->initialized
)
2283 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2285 hsa_queue_t
*command_q
= (aq
? aq
->hsa_queue
: kernel
->agent
->sync_queue
);
2288 = hsa_fns
.hsa_queue_add_write_index_release_fn (command_q
, 1);
2289 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index
);
2291 /* Wait until the queue is not full before writing the packet. */
2292 while (index
- hsa_fns
.hsa_queue_load_read_index_acquire_fn (command_q
)
2296 /* Do not allow the dimensions to be overridden when running
2297 constructors or destructors. */
2298 int override_x
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_x_dim
;
2299 int override_z
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_z_dim
;
2301 hsa_kernel_dispatch_packet_t
*packet
;
2302 packet
= ((hsa_kernel_dispatch_packet_t
*) command_q
->base_address
)
2303 + index
% command_q
->size
;
2305 memset (((uint8_t *) packet
) + 4, 0, sizeof (*packet
) - 4);
2306 packet
->grid_size_x
= override_x
? : kla
->gdims
[0];
2307 packet
->workgroup_size_x
= get_group_size (kla
->ndim
,
2308 packet
->grid_size_x
,
2313 packet
->grid_size_y
= kla
->gdims
[1];
2314 packet
->workgroup_size_y
= get_group_size (kla
->ndim
, kla
->gdims
[1],
2319 packet
->grid_size_y
= 1;
2320 packet
->workgroup_size_y
= 1;
2325 packet
->grid_size_z
= limit_worker_threads (override_z
2327 packet
->workgroup_size_z
= get_group_size (kla
->ndim
,
2328 packet
->grid_size_z
,
2333 packet
->grid_size_z
= 1;
2334 packet
->workgroup_size_z
= 1;
2337 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2338 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2339 packet
->grid_size_x
, packet
->grid_size_y
, packet
->grid_size_z
,
2340 packet
->grid_size_x
/ packet
->workgroup_size_x
,
2341 packet
->grid_size_y
/ packet
->workgroup_size_y
,
2342 packet
->grid_size_z
/ packet
->workgroup_size_z
,
2343 packet
->workgroup_size_x
, packet
->workgroup_size_y
,
2344 packet
->workgroup_size_z
);
2346 struct kernel_dispatch
*shadow
2347 = create_kernel_dispatch (kernel
, packet
->grid_size_x
,
2348 packet
->grid_size_z
);
2349 shadow
->queue
= command_q
;
2353 fprintf (stderr
, "\nKernel has following dependencies:\n");
2354 print_kernel_dispatch (shadow
, 2);
2357 packet
->private_segment_size
= shadow
->private_segment_size
;
2358 packet
->group_segment_size
= shadow
->group_segment_size
;
2359 packet
->kernel_object
= shadow
->object
;
2360 packet
->kernarg_address
= shadow
->kernarg_address
;
2362 s
.handle
= shadow
->signal
;
2363 packet
->completion_signal
= s
;
2364 hsa_fns
.hsa_signal_store_relaxed_fn (s
, 1);
2365 memcpy (shadow
->kernarg_address
, &vars
, sizeof (vars
));
2367 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2370 header
= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
2371 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
2372 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
2374 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel
->name
,
2377 packet_store_release ((uint32_t *) packet
, header
,
2378 (uint16_t) kla
->ndim
2379 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
);
2381 hsa_fns
.hsa_signal_store_release_fn (command_q
->doorbell_signal
,
2384 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2386 /* Root signal waits with 1ms timeout. */
2387 while (hsa_fns
.hsa_signal_wait_acquire_fn (s
, HSA_SIGNAL_CONDITION_LT
, 1,
2389 HSA_WAIT_STATE_BLOCKED
) != 0)
2391 console_output (kernel
, shadow
->kernarg_address
, false);
2393 console_output (kernel
, shadow
->kernarg_address
, true);
2395 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2396 unsigned int return_value
= (unsigned int)kernargs
->output_data
.return_value
;
2398 release_kernel_dispatch (shadow
);
2400 if (!module_locked
&& pthread_rwlock_unlock (&agent
->module_rwlock
))
2401 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2403 unsigned int upper
= (return_value
& ~0xffff) >> 16;
2404 if (upper
== 0xcafe)
2405 ; // exit not called, normal termination.
2406 else if (upper
== 0xffff)
2410 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2411 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2416 if (upper
== 0xffff)
2418 unsigned int signal
= (return_value
>> 8) & 0xff;
2420 if (signal
== SIGABRT
)
2422 GCN_WARNING ("GCN Kernel aborted\n");
2425 else if (signal
!= 0)
2427 GCN_WARNING ("GCN Kernel received unknown signal\n");
2431 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value
& 0xff);
2432 exit (return_value
& 0xff);
2437 /* {{{ Load/Unload */
2439 /* Initialize KERNEL from D and other parameters. Return true on success. */
2442 init_basic_kernel_info (struct kernel_info
*kernel
,
2443 struct hsa_kernel_description
*d
,
2444 struct agent_info
*agent
,
2445 struct module_info
*module
)
2447 kernel
->agent
= agent
;
2448 kernel
->module
= module
;
2449 kernel
->name
= d
->name
;
2450 kernel
->description
= d
;
2451 if (pthread_mutex_init (&kernel
->init_mutex
, NULL
))
2453 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2459 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2462 isa_matches_agent (struct agent_info
*agent
, Elf64_Ehdr
*image
)
2464 int isa_field
= elf_gcn_isa_field (image
);
2465 const char* isa_s
= isa_hsa_name (isa_field
);
2468 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR
);
2472 if (isa_field
!= agent
->device_isa
)
2475 const char *agent_isa_s
= isa_hsa_name (agent
->device_isa
);
2476 const char *agent_isa_gcc_s
= isa_gcc_name (agent
->device_isa
);
2477 assert (agent_isa_s
);
2478 assert (agent_isa_gcc_s
);
2480 snprintf (msg
, sizeof msg
,
2481 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2482 "Try to recompile with '-foffload-options=-march=%s'.\n",
2483 isa_s
, agent_isa_s
, agent_isa_gcc_s
);
2485 hsa_error (msg
, HSA_STATUS_ERROR
);
2492 /* Create and finalize the program consisting of all loaded modules. */
2495 create_and_finalize_hsa_program (struct agent_info
*agent
)
2497 hsa_status_t status
;
2499 if (pthread_mutex_lock (&agent
->prog_mutex
))
2501 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2504 if (agent
->prog_finalized
)
2508 = hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
2509 HSA_EXECUTABLE_STATE_UNFROZEN
,
2510 "", &agent
->executable
);
2511 if (status
!= HSA_STATUS_SUCCESS
)
2513 hsa_error ("Could not create GCN executable", status
);
2517 /* Load any GCN modules. */
2518 struct module_info
*module
= agent
->module
;
2521 Elf64_Ehdr
*image
= (Elf64_Ehdr
*)module
->image_desc
->gcn_image
->image
;
2523 if (!isa_matches_agent (agent
, image
))
2526 hsa_code_object_t co
= { 0 };
2527 status
= hsa_fns
.hsa_code_object_deserialize_fn
2528 (module
->image_desc
->gcn_image
->image
,
2529 module
->image_desc
->gcn_image
->size
,
2531 if (status
!= HSA_STATUS_SUCCESS
)
2533 hsa_error ("Could not deserialize GCN code object", status
);
2537 status
= hsa_fns
.hsa_executable_load_code_object_fn
2538 (agent
->executable
, agent
->id
, co
, "");
2539 if (status
!= HSA_STATUS_SUCCESS
)
2541 hsa_error ("Could not load GCN code object", status
);
2547 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
2548 gcn_kernel_heap_size
,
2549 (void**)&module
->heap
);
2550 if (status
!= HSA_STATUS_SUCCESS
)
2552 hsa_error ("Could not allocate memory for GCN heap", status
);
2556 status
= hsa_fns
.hsa_memory_assign_agent_fn
2557 (module
->heap
, agent
->id
, HSA_ACCESS_PERMISSION_RW
);
2558 if (status
!= HSA_STATUS_SUCCESS
)
2560 hsa_error ("Could not assign GCN heap memory to device", status
);
2564 hsa_fns
.hsa_memory_copy_fn (&module
->heap
->size
,
2565 &gcn_kernel_heap_size
,
2566 sizeof (gcn_kernel_heap_size
));
2572 dump_executable_symbols (agent
->executable
);
2574 status
= hsa_fns
.hsa_executable_freeze_fn (agent
->executable
, "");
2575 if (status
!= HSA_STATUS_SUCCESS
)
2577 hsa_error ("Could not freeze the GCN executable", status
);
2582 agent
->prog_finalized
= true;
2584 if (pthread_mutex_unlock (&agent
->prog_mutex
))
2586 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2597 /* Free the HSA program in agent and everything associated with it and set
2598 agent->prog_finalized and the initialized flags of all kernels to false.
2599 Return TRUE on success. */
2602 destroy_hsa_program (struct agent_info
*agent
)
2604 if (!agent
->prog_finalized
)
2607 hsa_status_t status
;
2609 GCN_DEBUG ("Destroying the current GCN program.\n");
2611 status
= hsa_fns
.hsa_executable_destroy_fn (agent
->executable
);
2612 if (status
!= HSA_STATUS_SUCCESS
)
2613 return hsa_error ("Could not destroy GCN executable", status
);
2618 for (i
= 0; i
< agent
->module
->kernel_count
; i
++)
2619 agent
->module
->kernels
[i
].initialized
= false;
2621 if (agent
->module
->heap
)
2623 hsa_fns
.hsa_memory_free_fn (agent
->module
->heap
);
2624 agent
->module
->heap
= NULL
;
2627 agent
->prog_finalized
= false;
2631 /* Deinitialize all information associated with MODULE and kernels within
2632 it. Return TRUE on success. */
2635 destroy_module (struct module_info
*module
, bool locked
)
2637 /* Run destructors before destroying module. */
2638 struct GOMP_kernel_launch_attributes kla
=
2642 /* Work-group size. */
2646 if (module
->fini_array_func
)
2648 init_kernel (module
->fini_array_func
);
2649 run_kernel (module
->fini_array_func
, NULL
, &kla
, NULL
, locked
);
2651 module
->constructors_run_p
= false;
2654 for (i
= 0; i
< module
->kernel_count
; i
++)
2655 if (pthread_mutex_destroy (&module
->kernels
[i
].init_mutex
))
2657 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2668 /* Callback of dispatch queues to report errors. */
2671 execute_queue_entry (struct goacc_asyncqueue
*aq
, int index
)
2673 struct queue_entry
*entry
= &aq
->queue
[index
];
2675 switch (entry
->type
)
2679 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2680 aq
->agent
->device_id
, aq
->id
, index
);
2681 run_kernel (entry
->u
.launch
.kernel
,
2682 entry
->u
.launch
.vars
,
2683 &entry
->u
.launch
.kla
, aq
, false);
2685 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2686 aq
->agent
->device_id
, aq
->id
, index
);
2691 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2692 aq
->agent
->device_id
, aq
->id
, index
);
2693 entry
->u
.callback
.fn (entry
->u
.callback
.data
);
2695 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2696 aq
->agent
->device_id
, aq
->id
, index
);
2701 /* FIXME: is it safe to access a placeholder that may already have
2703 struct placeholder
*placeholderp
= entry
->u
.asyncwait
.placeholderp
;
2706 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2707 aq
->agent
->device_id
, aq
->id
, index
);
2709 pthread_mutex_lock (&placeholderp
->mutex
);
2711 while (!placeholderp
->executed
)
2712 pthread_cond_wait (&placeholderp
->cond
, &placeholderp
->mutex
);
2714 pthread_mutex_unlock (&placeholderp
->mutex
);
2716 if (pthread_cond_destroy (&placeholderp
->cond
))
2717 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2719 if (pthread_mutex_destroy (&placeholderp
->mutex
))
2720 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2723 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2724 "entry (%d) done\n", aq
->agent
->device_id
, aq
->id
, index
);
2728 case ASYNC_PLACEHOLDER
:
2729 pthread_mutex_lock (&entry
->u
.placeholder
.mutex
);
2730 entry
->u
.placeholder
.executed
= 1;
2731 pthread_cond_signal (&entry
->u
.placeholder
.cond
);
2732 pthread_mutex_unlock (&entry
->u
.placeholder
.mutex
);
2736 GOMP_PLUGIN_fatal ("Unknown queue element");
2740 /* This function is run as a thread to service an async queue in the
2741 background. It runs continuously until the stop flag is set. */
2744 drain_queue (void *thread_arg
)
2746 struct goacc_asyncqueue
*aq
= thread_arg
;
2748 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
2750 aq
->drain_queue_stop
= 2;
2754 pthread_mutex_lock (&aq
->mutex
);
2758 if (aq
->drain_queue_stop
)
2761 if (aq
->queue_n
> 0)
2763 pthread_mutex_unlock (&aq
->mutex
);
2764 execute_queue_entry (aq
, aq
->queue_first
);
2766 pthread_mutex_lock (&aq
->mutex
);
2767 aq
->queue_first
= ((aq
->queue_first
+ 1)
2768 % ASYNC_QUEUE_SIZE
);
2771 if (DEBUG_THREAD_SIGNAL
)
2772 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2773 aq
->agent
->device_id
, aq
->id
);
2774 pthread_cond_broadcast (&aq
->queue_cond_out
);
2775 pthread_mutex_unlock (&aq
->mutex
);
2778 GCN_DEBUG ("Async thread %d:%d: continue\n", aq
->agent
->device_id
,
2780 pthread_mutex_lock (&aq
->mutex
);
2784 if (DEBUG_THREAD_SLEEP
)
2785 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2786 aq
->agent
->device_id
, aq
->id
);
2787 pthread_cond_wait (&aq
->queue_cond_in
, &aq
->mutex
);
2788 if (DEBUG_THREAD_SLEEP
)
2789 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2790 aq
->agent
->device_id
, aq
->id
);
2794 aq
->drain_queue_stop
= 2;
2795 if (DEBUG_THREAD_SIGNAL
)
2796 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2797 aq
->agent
->device_id
, aq
->id
);
2798 pthread_cond_broadcast (&aq
->queue_cond_out
);
2799 pthread_mutex_unlock (&aq
->mutex
);
2801 GCN_DEBUG ("Async thread %d:%d: returning\n", aq
->agent
->device_id
, aq
->id
);
2805 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2806 is not usually the case. This is just a debug tool. */
2809 drain_queue_synchronous (struct goacc_asyncqueue
*aq
)
2811 pthread_mutex_lock (&aq
->mutex
);
2813 while (aq
->queue_n
> 0)
2815 execute_queue_entry (aq
, aq
->queue_first
);
2817 aq
->queue_first
= ((aq
->queue_first
+ 1)
2818 % ASYNC_QUEUE_SIZE
);
2822 pthread_mutex_unlock (&aq
->mutex
);
2825 /* Block the current thread until an async queue is writable. The aq->mutex
2826 lock should be held on entry, and remains locked on exit. */
2829 wait_for_queue_nonfull (struct goacc_asyncqueue
*aq
)
2831 if (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2833 /* Queue is full. Wait for it to not be full. */
2834 while (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2835 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2839 /* Request an asynchronous kernel launch on the specified queue. This
2840 may block if the queue is full, but returns without waiting for the
2844 queue_push_launch (struct goacc_asyncqueue
*aq
, struct kernel_info
*kernel
,
2845 void *vars
, struct GOMP_kernel_launch_attributes
*kla
)
2847 assert (aq
->agent
== kernel
->agent
);
2849 pthread_mutex_lock (&aq
->mutex
);
2851 wait_for_queue_nonfull (aq
);
2853 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2854 % ASYNC_QUEUE_SIZE
);
2856 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq
->agent
->device_id
,
2857 aq
->id
, queue_last
);
2859 aq
->queue
[queue_last
].type
= KERNEL_LAUNCH
;
2860 aq
->queue
[queue_last
].u
.launch
.kernel
= kernel
;
2861 aq
->queue
[queue_last
].u
.launch
.vars
= vars
;
2862 aq
->queue
[queue_last
].u
.launch
.kla
= *kla
;
2866 if (DEBUG_THREAD_SIGNAL
)
2867 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2868 aq
->agent
->device_id
, aq
->id
);
2869 pthread_cond_signal (&aq
->queue_cond_in
);
2871 pthread_mutex_unlock (&aq
->mutex
);
2874 /* Request an asynchronous callback on the specified queue. The callback
2875 function will be called, with the given opaque data, from the appropriate
2876 async thread, when all previous items on that queue are complete. */
2879 queue_push_callback (struct goacc_asyncqueue
*aq
, void (*fn
)(void *),
2882 pthread_mutex_lock (&aq
->mutex
);
2884 wait_for_queue_nonfull (aq
);
2886 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2887 % ASYNC_QUEUE_SIZE
);
2889 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq
->agent
->device_id
,
2890 aq
->id
, queue_last
);
2892 aq
->queue
[queue_last
].type
= CALLBACK
;
2893 aq
->queue
[queue_last
].u
.callback
.fn
= fn
;
2894 aq
->queue
[queue_last
].u
.callback
.data
= data
;
2898 if (DEBUG_THREAD_SIGNAL
)
2899 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2900 aq
->agent
->device_id
, aq
->id
);
2901 pthread_cond_signal (&aq
->queue_cond_in
);
2903 pthread_mutex_unlock (&aq
->mutex
);
2906 /* Request that a given async thread wait for another thread (unspecified) to
2907 reach the given placeholder. The wait will occur when all previous entries
2908 on the queue are complete. A placeholder is effectively a kind of signal
2909 which simply sets a flag when encountered in a queue. */
2912 queue_push_asyncwait (struct goacc_asyncqueue
*aq
,
2913 struct placeholder
*placeholderp
)
2915 pthread_mutex_lock (&aq
->mutex
);
2917 wait_for_queue_nonfull (aq
);
2919 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2921 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq
->agent
->device_id
,
2922 aq
->id
, queue_last
);
2924 aq
->queue
[queue_last
].type
= ASYNC_WAIT
;
2925 aq
->queue
[queue_last
].u
.asyncwait
.placeholderp
= placeholderp
;
2929 if (DEBUG_THREAD_SIGNAL
)
2930 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2931 aq
->agent
->device_id
, aq
->id
);
2932 pthread_cond_signal (&aq
->queue_cond_in
);
2934 pthread_mutex_unlock (&aq
->mutex
);
2937 /* Add a placeholder into an async queue. When the async thread reaches the
2938 placeholder it will set the "executed" flag to true and continue.
2939 Another thread may be waiting on this thread reaching the placeholder. */
2941 static struct placeholder
*
2942 queue_push_placeholder (struct goacc_asyncqueue
*aq
)
2944 struct placeholder
*placeholderp
;
2946 pthread_mutex_lock (&aq
->mutex
);
2948 wait_for_queue_nonfull (aq
);
2950 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2952 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq
->agent
->device_id
,
2953 aq
->id
, queue_last
);
2955 aq
->queue
[queue_last
].type
= ASYNC_PLACEHOLDER
;
2956 placeholderp
= &aq
->queue
[queue_last
].u
.placeholder
;
2958 if (pthread_mutex_init (&placeholderp
->mutex
, NULL
))
2960 pthread_mutex_unlock (&aq
->mutex
);
2961 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2964 if (pthread_cond_init (&placeholderp
->cond
, NULL
))
2966 pthread_mutex_unlock (&aq
->mutex
);
2967 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2970 placeholderp
->executed
= 0;
2974 if (DEBUG_THREAD_SIGNAL
)
2975 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2976 aq
->agent
->device_id
, aq
->id
);
2977 pthread_cond_signal (&aq
->queue_cond_in
);
2979 pthread_mutex_unlock (&aq
->mutex
);
2981 return placeholderp
;
2984 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2987 finalize_async_thread (struct goacc_asyncqueue
*aq
)
2989 pthread_mutex_lock (&aq
->mutex
);
2990 if (aq
->drain_queue_stop
== 2)
2992 pthread_mutex_unlock (&aq
->mutex
);
2996 aq
->drain_queue_stop
= 1;
2998 if (DEBUG_THREAD_SIGNAL
)
2999 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3000 aq
->agent
->device_id
, aq
->id
);
3001 pthread_cond_signal (&aq
->queue_cond_in
);
3003 while (aq
->drain_queue_stop
!= 2)
3005 if (DEBUG_THREAD_SLEEP
)
3006 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3007 " to sleep\n", aq
->agent
->device_id
, aq
->id
);
3008 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
3009 if (DEBUG_THREAD_SLEEP
)
3010 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
3011 aq
->agent
->device_id
, aq
->id
);
3014 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq
->agent
->device_id
,
3016 pthread_mutex_unlock (&aq
->mutex
);
3018 int err
= pthread_join (aq
->thread_drain_queue
, NULL
);
3020 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3021 aq
->agent
->device_id
, aq
->id
, strerror (err
));
3022 GCN_DEBUG ("Joined with async thread %d:%d\n", aq
->agent
->device_id
, aq
->id
);
3025 /* Set up an async queue for OpenMP. There will be only one. The
3026 implementation simply uses an OpenACC async queue.
3027 FIXME: is this thread-safe if two threads call this function? */
3030 maybe_init_omp_async (struct agent_info
*agent
)
3032 if (!agent
->omp_async_queue
)
3033 agent
->omp_async_queue
3034 = GOMP_OFFLOAD_openacc_async_construct (agent
->device_id
);
3037 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3038 copies from read-only pages. */
3041 hsa_memory_copy_wrapper (void *dst
, const void *src
, size_t len
)
3043 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, len
);
3045 if (status
== HSA_STATUS_SUCCESS
)
3048 /* It appears that the copy fails if the source data is in a read-only page.
3049 We can't detect that easily, so try copying the data to a temporary buffer
3050 and doing the copy again if we got an error above. */
3052 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3053 "[%p:+%d]\n", (void *) src
, (int) len
);
3055 void *src_copy
= malloc (len
);
3056 memcpy (src_copy
, src
, len
);
3057 status
= hsa_fns
.hsa_memory_copy_fn (dst
, (const void *) src_copy
, len
);
3059 if (status
!= HSA_STATUS_SUCCESS
)
3060 GOMP_PLUGIN_error ("memory copy failed");
3063 /* Copy data to or from a device. This is intended for use as an async
3067 copy_data (void *data_
)
3069 struct copy_data
*data
= (struct copy_data
*)data_
;
3070 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3071 data
->aq
->agent
->device_id
, data
->aq
->id
, data
->len
, data
->src
,
3073 hsa_memory_copy_wrapper (data
->dst
, data
->src
, data
->len
);
3077 /* Request an asynchronous data copy, to or from a device, on a given queue.
3078 The event will be registered as a callback. */
3081 queue_push_copy (struct goacc_asyncqueue
*aq
, void *dst
, const void *src
,
3085 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3086 aq
->agent
->device_id
, aq
->id
, len
, src
, dst
);
3087 struct copy_data
*data
3088 = (struct copy_data
*)GOMP_PLUGIN_malloc (sizeof (struct copy_data
));
3093 queue_push_callback (aq
, copy_data
, data
);
3096 /* Return true if the given queue is currently empty. */
3099 queue_empty (struct goacc_asyncqueue
*aq
)
3101 pthread_mutex_lock (&aq
->mutex
);
3102 int res
= aq
->queue_n
== 0 ? 1 : 0;
3103 pthread_mutex_unlock (&aq
->mutex
);
3108 /* Wait for a given queue to become empty. This implements an OpenACC wait
3112 wait_queue (struct goacc_asyncqueue
*aq
)
3114 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
3116 drain_queue_synchronous (aq
);
3120 pthread_mutex_lock (&aq
->mutex
);
3122 while (aq
->queue_n
> 0)
3124 if (DEBUG_THREAD_SLEEP
)
3125 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3126 aq
->agent
->device_id
, aq
->id
);
3127 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
3128 if (DEBUG_THREAD_SLEEP
)
3129 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq
->agent
->device_id
,
3133 pthread_mutex_unlock (&aq
->mutex
);
3134 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq
->agent
->device_id
, aq
->id
);
3138 /* {{{ OpenACC support */
3140 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3143 gcn_exec (struct kernel_info
*kernel
,
3144 void **devaddrs
, unsigned *dims
, void *targ_mem_desc
, bool async
,
3145 struct goacc_asyncqueue
*aq
)
3147 if (!GOMP_OFFLOAD_can_run (kernel
))
3148 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3150 /* If we get here then this must be an OpenACC kernel. */
3151 kernel
->kind
= KIND_OPENACC
;
3153 struct hsa_kernel_description
*hsa_kernel_desc
= NULL
;
3154 for (unsigned i
= 0; i
< kernel
->module
->image_desc
->kernel_count
; i
++)
3156 struct hsa_kernel_description
*d
3157 = &kernel
->module
->image_desc
->kernel_infos
[i
];
3158 if (d
->name
== kernel
->name
)
3160 hsa_kernel_desc
= d
;
3165 /* We may have statically-determined dimensions in
3166 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3167 invocation at runtime in dims[]. We allow static dimensions to take
3168 priority over dynamic dimensions when present (non-zero). */
3169 if (hsa_kernel_desc
->oacc_dims
[0] > 0)
3170 dims
[0] = hsa_kernel_desc
->oacc_dims
[0];
3171 if (hsa_kernel_desc
->oacc_dims
[1] > 0)
3172 dims
[1] = hsa_kernel_desc
->oacc_dims
[1];
3173 if (hsa_kernel_desc
->oacc_dims
[2] > 0)
3174 dims
[2] = hsa_kernel_desc
->oacc_dims
[2];
3176 /* Ideally, when a dimension isn't explicitly specified, we should
3177 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3178 In practice, we tune for peak performance on BabelStream, which
3179 for OpenACC is currently 32 threads per CU. */
3180 if (dims
[0] == 0 && dims
[1] == 0)
3182 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3183 number. There isn't really a correct answer for this without a clue
3184 about the problem size, so let's do a reasonable number of workers
3187 dims
[0] = get_cu_count (kernel
->agent
) * 4; /* Gangs. */
3188 dims
[1] = 8; /* Workers. */
3190 else if (dims
[0] == 0 && dims
[1] > 0)
3192 /* Auto-scale the number of gangs with the requested number of workers. */
3193 dims
[0] = get_cu_count (kernel
->agent
) * (32 / dims
[1]);
3195 else if (dims
[0] > 0 && dims
[1] == 0)
3197 /* Auto-scale the number of workers with the requested number of gangs. */
3198 dims
[1] = get_cu_count (kernel
->agent
) * 32 / dims
[0];
3205 /* The incoming dimensions are expressed in terms of gangs, workers, and
3206 vectors. The HSA dimensions are expressed in terms of "work-items",
3207 which means multiples of vector lanes.
3209 The "grid size" specifies the size of the problem space, and the
3210 "work-group size" specifies how much of that we want a single compute
3211 unit to chew on at once.
3213 The three dimensions do not really correspond to hardware, but the
3214 important thing is that the HSA runtime will launch as many
3215 work-groups as it takes to process the entire grid, and each
3216 work-group will contain as many wave-fronts as it takes to process
3217 the work-items in that group.
3219 Essentially, as long as we set the Y dimension to 64 (the number of
3220 vector lanes in hardware), and the Z group size to the maximum (16),
3221 then we will get the gangs (X) and workers (Z) launched as we expect.
3223 The reason for the apparent reversal of vector and worker dimension
3224 order is to do with the way the run-time distributes work-items across
3226 struct GOMP_kernel_launch_attributes kla
=
3229 {dims
[0], 64, dims
[1]},
3230 /* Work-group size. */
3234 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3235 acc_prof_info
*prof_info
= thr
->prof_info
;
3236 acc_event_info enqueue_launch_event_info
;
3237 acc_api_info
*api_info
= thr
->api_info
;
3238 bool profiling_dispatch_p
= __builtin_expect (prof_info
!= NULL
, false);
3239 if (profiling_dispatch_p
)
3241 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
3243 enqueue_launch_event_info
.launch_event
.event_type
3244 = prof_info
->event_type
;
3245 enqueue_launch_event_info
.launch_event
.valid_bytes
3246 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
3247 enqueue_launch_event_info
.launch_event
.parent_construct
3248 = acc_construct_parallel
;
3249 enqueue_launch_event_info
.launch_event
.implicit
= 1;
3250 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
3251 enqueue_launch_event_info
.launch_event
.kernel_name
3252 = (char *) kernel
->name
;
3253 enqueue_launch_event_info
.launch_event
.num_gangs
= kla
.gdims
[0];
3254 enqueue_launch_event_info
.launch_event
.num_workers
= kla
.gdims
[2];
3255 enqueue_launch_event_info
.launch_event
.vector_length
= kla
.gdims
[1];
3257 api_info
->device_api
= acc_device_api_other
;
3259 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3260 &enqueue_launch_event_info
, api_info
);
3264 run_kernel (kernel
, devaddrs
, &kla
, NULL
, false);
3266 queue_push_launch (aq
, kernel
, devaddrs
, &kla
);
3268 if (profiling_dispatch_p
)
3270 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
3271 enqueue_launch_event_info
.launch_event
.event_type
= prof_info
->event_type
;
3272 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3273 &enqueue_launch_event_info
,
3279 /* {{{ Generic Plugin API */
3281 /* Return the name of the accelerator, which is "gcn". */
3284 GOMP_OFFLOAD_get_name (void)
3289 /* Return the specific capabilities the HSA accelerator have. */
3292 GOMP_OFFLOAD_get_caps (void)
3294 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3295 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3296 | GOMP_OFFLOAD_CAP_OPENACC_200
;
3299 /* Identify as GCN accelerator. */
3302 GOMP_OFFLOAD_get_type (void)
3304 return OFFLOAD_TARGET_TYPE_GCN
;
3307 /* Return the libgomp version number we're compatible with. There is
3308 no requirement for cross-version compatibility. */
3311 GOMP_OFFLOAD_version (void)
3313 return GOMP_VERSION
;
3316 /* Return the number of GCN devices on the system. */
3319 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask
)
3321 if (!init_hsa_context ())
3323 /* Return -1 if no omp_requires_mask cannot be fulfilled but
3324 devices were present. */
3325 if (hsa_context
.agent_count
> 0
3326 && ((omp_requires_mask
3327 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3328 | GOMP_REQUIRES_REVERSE_OFFLOAD
)) != 0))
3330 return hsa_context
.agent_count
;
3333 /* Initialize device (agent) number N so that it can be used for computation.
3334 Return TRUE on success. */
3337 GOMP_OFFLOAD_init_device (int n
)
3339 if (!init_hsa_context ())
3341 if (n
>= hsa_context
.agent_count
)
3343 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n
);
3346 struct agent_info
*agent
= &hsa_context
.agents
[n
];
3348 if (agent
->initialized
)
3351 agent
->device_id
= n
;
3353 if (pthread_rwlock_init (&agent
->module_rwlock
, NULL
))
3355 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3358 if (pthread_mutex_init (&agent
->prog_mutex
, NULL
))
3360 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3363 if (pthread_mutex_init (&agent
->async_queues_mutex
, NULL
))
3365 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3368 if (pthread_mutex_init (&agent
->ephemeral_memories_write_lock
, NULL
))
3370 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3373 agent
->async_queues
= NULL
;
3374 agent
->omp_async_queue
= NULL
;
3375 agent
->ephemeral_memories_list
= NULL
;
3377 uint32_t queue_size
;
3378 hsa_status_t status
;
3379 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
,
3380 HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
3382 if (status
!= HSA_STATUS_SUCCESS
)
3383 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3386 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_NAME
,
3388 if (status
!= HSA_STATUS_SUCCESS
)
3389 return hsa_error ("Error querying the name of the agent", status
);
3391 agent
->device_isa
= isa_code (agent
->name
);
3392 if (agent
->device_isa
== EF_AMDGPU_MACH_UNSUPPORTED
)
3393 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR
);
3395 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_VENDOR_NAME
,
3396 &agent
->vendor_name
);
3397 if (status
!= HSA_STATUS_SUCCESS
)
3398 return hsa_error ("Error querying the vendor name of the agent", status
);
3400 status
= hsa_fns
.hsa_queue_create_fn (agent
->id
, queue_size
,
3401 HSA_QUEUE_TYPE_MULTI
,
3402 hsa_queue_callback
, NULL
, UINT32_MAX
,
3403 UINT32_MAX
, &agent
->sync_queue
);
3404 if (status
!= HSA_STATUS_SUCCESS
)
3405 return hsa_error ("Error creating command queue", status
);
3407 agent
->kernarg_region
.handle
= (uint64_t) -1;
3408 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3409 get_kernarg_memory_region
,
3410 &agent
->kernarg_region
);
3411 if (status
!= HSA_STATUS_SUCCESS
3412 && status
!= HSA_STATUS_INFO_BREAK
)
3413 hsa_error ("Scanning memory regions failed", status
);
3414 if (agent
->kernarg_region
.handle
== (uint64_t) -1)
3416 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3420 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3421 dump_hsa_region (agent
->kernarg_region
, NULL
);
3423 agent
->data_region
.handle
= (uint64_t) -1;
3424 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3425 get_data_memory_region
,
3426 &agent
->data_region
);
3427 if (status
!= HSA_STATUS_SUCCESS
3428 && status
!= HSA_STATUS_INFO_BREAK
)
3429 hsa_error ("Scanning memory regions failed", status
);
3430 if (agent
->data_region
.handle
== (uint64_t) -1)
3432 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3436 GCN_DEBUG ("Selected device data memory region:\n");
3437 dump_hsa_region (agent
->data_region
, NULL
);
3439 GCN_DEBUG ("GCN agent %d initialized\n", n
);
3441 agent
->initialized
= true;
3445 /* Load GCN object-code module described by struct gcn_image_desc in
3446 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3447 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3448 contain the on-device addresses of the functions for reverse offload. To be
3449 freed by the caller. */
3452 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
3453 struct addr_pair
**target_table
,
3454 uint64_t **rev_fn_table
,
3455 uint64_t *host_ind_fn_table
)
3457 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3459 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3460 " (expected %u, received %u)",
3461 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3465 struct gcn_image_desc
*image_desc
= (struct gcn_image_desc
*) target_data
;
3466 struct agent_info
*agent
;
3467 struct addr_pair
*pair
;
3468 struct module_info
*module
;
3469 struct kernel_info
*kernel
;
3470 int kernel_count
= image_desc
->kernel_count
;
3471 unsigned ind_func_count
= GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version
)
3472 ? image_desc
->ind_func_count
: 0;
3473 unsigned var_count
= image_desc
->global_variable_count
;
3474 /* Currently, "others" is a struct of ICVS. */
3475 int other_count
= 1;
3477 agent
= get_agent_info (ord
);
3481 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3483 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3486 if (agent
->prog_finalized
3487 && !destroy_hsa_program (agent
))
3490 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count
);
3491 GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count
);
3492 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count
);
3493 GCN_DEBUG ("Expect %d other variables in an image\n", other_count
);
3494 pair
= GOMP_PLUGIN_malloc ((kernel_count
+ var_count
+ other_count
- 2)
3495 * sizeof (struct addr_pair
));
3496 *target_table
= pair
;
3497 module
= (struct module_info
*)
3498 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info
)
3499 + kernel_count
* sizeof (struct kernel_info
));
3500 module
->image_desc
= image_desc
;
3501 module
->kernel_count
= kernel_count
;
3502 module
->heap
= NULL
;
3503 module
->constructors_run_p
= false;
3505 kernel
= &module
->kernels
[0];
3507 /* Allocate memory for kernel dependencies. */
3508 for (unsigned i
= 0; i
< kernel_count
; i
++)
3510 struct hsa_kernel_description
*d
= &image_desc
->kernel_infos
[i
];
3511 if (!init_basic_kernel_info (kernel
, d
, agent
, module
))
3513 if (strcmp (d
->name
, "_init_array") == 0)
3514 module
->init_array_func
= kernel
;
3515 else if (strcmp (d
->name
, "_fini_array") == 0)
3516 module
->fini_array_func
= kernel
;
3519 pair
->start
= (uintptr_t) kernel
;
3520 pair
->end
= (uintptr_t) (kernel
+ 1);
3526 agent
->module
= module
;
3527 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3529 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3533 if (!create_and_finalize_hsa_program (agent
))
3538 hsa_status_t status
;
3539 hsa_executable_symbol_t var_symbol
;
3540 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3541 ".offload_var_table",
3545 if (status
!= HSA_STATUS_SUCCESS
)
3546 hsa_fatal ("Could not find symbol for variable in the code object",
3549 uint64_t var_table_addr
;
3550 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3551 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3553 if (status
!= HSA_STATUS_SUCCESS
)
3554 hsa_fatal ("Could not extract a variable from its symbol", status
);
3559 } var_table
[var_count
];
3560 GOMP_OFFLOAD_dev2host (agent
->device_id
, var_table
,
3561 (void*)var_table_addr
, sizeof (var_table
));
3563 for (unsigned i
= 0; i
< var_count
; i
++)
3565 pair
->start
= var_table
[i
].addr
;
3566 pair
->end
= var_table
[i
].addr
+ var_table
[i
].size
;
3567 GCN_DEBUG ("Found variable at %p with size %lu\n",
3568 (void *)var_table
[i
].addr
, var_table
[i
].size
);
3573 if (ind_func_count
> 0)
3575 hsa_status_t status
;
3577 /* Read indirect function table from image. */
3578 hsa_executable_symbol_t ind_funcs_symbol
;
3579 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3580 ".offload_ind_func_table",
3582 0, &ind_funcs_symbol
);
3584 if (status
!= HSA_STATUS_SUCCESS
)
3585 hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3586 "code object", status
);
3588 uint64_t ind_funcs_table_addr
;
3589 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3590 (ind_funcs_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3591 &ind_funcs_table_addr
);
3592 if (status
!= HSA_STATUS_SUCCESS
)
3593 hsa_fatal ("Could not extract a variable from its symbol", status
);
3595 uint64_t ind_funcs_table
[ind_func_count
];
3596 GOMP_OFFLOAD_dev2host (agent
->device_id
, ind_funcs_table
,
3597 (void*) ind_funcs_table_addr
,
3598 sizeof (ind_funcs_table
));
3600 /* Build host->target address map for indirect functions. */
3601 uint64_t ind_fn_map
[ind_func_count
* 2 + 1];
3602 for (unsigned i
= 0; i
< ind_func_count
; i
++)
3604 ind_fn_map
[i
* 2] = host_ind_fn_table
[i
];
3605 ind_fn_map
[i
* 2 + 1] = ind_funcs_table
[i
];
3606 GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3607 i
, host_ind_fn_table
[i
], ind_funcs_table
[i
]);
3609 ind_fn_map
[ind_func_count
* 2] = 0;
3611 /* Write the map onto the target. */
3612 void *map_target_addr
3613 = GOMP_OFFLOAD_alloc (agent
->device_id
, sizeof (ind_fn_map
));
3614 GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr
);
3616 GOMP_OFFLOAD_host2dev (agent
->device_id
, map_target_addr
,
3618 sizeof (ind_fn_map
));
3620 /* Write address of the map onto the target. */
3621 hsa_executable_symbol_t symbol
;
3624 = hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3625 XSTRING (GOMP_INDIRECT_ADDR_MAP
),
3626 agent
->id
, 0, &symbol
);
3627 if (status
!= HSA_STATUS_SUCCESS
)
3628 hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3634 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3635 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3637 if (status
!= HSA_STATUS_SUCCESS
)
3638 hsa_fatal ("Could not extract a variable from its symbol", status
);
3639 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3640 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE
,
3642 if (status
!= HSA_STATUS_SUCCESS
)
3643 hsa_fatal ("Could not extract a variable size from its symbol",
3646 GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3649 GOMP_OFFLOAD_host2dev (agent
->device_id
, (void *) varptr
,
3651 sizeof (map_target_addr
));
3654 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS
));
3656 hsa_status_t status
;
3657 hsa_executable_symbol_t var_symbol
;
3658 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3659 XSTRING (GOMP_ADDITIONAL_ICVS
),
3660 agent
->id
, 0, &var_symbol
);
3661 if (status
== HSA_STATUS_SUCCESS
)
3666 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3667 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3669 if (status
!= HSA_STATUS_SUCCESS
)
3670 hsa_fatal ("Could not extract a variable from its symbol", status
);
3671 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3672 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE
,
3674 if (status
!= HSA_STATUS_SUCCESS
)
3675 hsa_fatal ("Could not extract a variable size from its symbol",
3678 pair
->start
= varptr
;
3679 pair
->end
= varptr
+ varsize
;
3683 /* The variable was not in this image. */
3684 GCN_DEBUG ("Variable not found in image: %s\n",
3685 XSTRING (GOMP_ADDITIONAL_ICVS
));
3686 pair
->start
= pair
->end
= 0;
3689 /* Ensure that constructors are run first. */
3690 struct GOMP_kernel_launch_attributes kla
=
3694 /* Work-group size. */
3698 if (module
->init_array_func
)
3700 init_kernel (module
->init_array_func
);
3701 run_kernel (module
->init_array_func
, NULL
, &kla
, NULL
, false);
3703 module
->constructors_run_p
= true;
3705 /* Don't report kernels that libgomp need not know about. */
3706 if (module
->init_array_func
)
3708 if (module
->fini_array_func
)
3711 if (rev_fn_table
!= NULL
&& kernel_count
== 0)
3712 *rev_fn_table
= NULL
;
3713 else if (rev_fn_table
!= NULL
)
3715 hsa_status_t status
;
3716 hsa_executable_symbol_t var_symbol
;
3717 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3718 ".offload_func_table",
3719 agent
->id
, 0, &var_symbol
);
3720 if (status
!= HSA_STATUS_SUCCESS
)
3721 hsa_fatal ("Could not find symbol for variable in the code object",
3723 uint64_t fn_table_addr
;
3724 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3725 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3727 if (status
!= HSA_STATUS_SUCCESS
)
3728 hsa_fatal ("Could not extract a variable from its symbol", status
);
3729 *rev_fn_table
= GOMP_PLUGIN_malloc (kernel_count
* sizeof (uint64_t));
3730 GOMP_OFFLOAD_dev2host (agent
->device_id
, *rev_fn_table
,
3731 (void*) fn_table_addr
,
3732 kernel_count
* sizeof (uint64_t));
3735 return kernel_count
+ var_count
+ other_count
;
3738 /* Unload GCN object-code module described by struct gcn_image_desc in
3739 TARGET_DATA from agent number N. Return TRUE on success. */
3742 GOMP_OFFLOAD_unload_image (int n
, unsigned version
, const void *target_data
)
3744 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3746 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3747 " (expected %u, received %u)",
3748 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3752 struct agent_info
*agent
;
3753 agent
= get_agent_info (n
);
3757 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3759 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3763 if (!agent
->module
|| agent
->module
->image_desc
!= target_data
)
3765 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3770 if (!destroy_module (agent
->module
, true))
3772 free (agent
->module
);
3773 agent
->module
= NULL
;
3774 if (!destroy_hsa_program (agent
))
3776 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3778 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3784 /* Deinitialize all information and status associated with agent number N. We
3785 do not attempt any synchronization, assuming the user and libgomp will not
3786 attempt deinitialization of a device that is in any way being used at the
3787 same time. Return TRUE on success. */
3790 GOMP_OFFLOAD_fini_device (int n
)
3792 struct agent_info
*agent
= get_agent_info (n
);
3796 if (!agent
->initialized
)
3799 if (agent
->omp_async_queue
)
3801 GOMP_OFFLOAD_openacc_async_destruct (agent
->omp_async_queue
);
3802 agent
->omp_async_queue
= NULL
;
3807 if (!destroy_module (agent
->module
, false))
3809 free (agent
->module
);
3810 agent
->module
= NULL
;
3813 if (!destroy_ephemeral_memories (agent
))
3816 if (!destroy_hsa_program (agent
))
3819 hsa_status_t status
= hsa_fns
.hsa_queue_destroy_fn (agent
->sync_queue
);
3820 if (status
!= HSA_STATUS_SUCCESS
)
3821 return hsa_error ("Error destroying command queue", status
);
3823 if (pthread_mutex_destroy (&agent
->prog_mutex
))
3825 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3828 if (pthread_rwlock_destroy (&agent
->module_rwlock
))
3830 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3834 if (pthread_mutex_destroy (&agent
->async_queues_mutex
))
3836 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3839 if (pthread_mutex_destroy (&agent
->ephemeral_memories_write_lock
))
3841 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3844 agent
->initialized
= false;
3848 /* Return true if the HSA runtime can run function FN_PTR. */
3851 GOMP_OFFLOAD_can_run (void *fn_ptr
)
3853 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3855 init_kernel (kernel
);
3856 if (kernel
->initialization_failed
)
3862 if (suppress_host_fallback
)
3863 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3864 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3868 /* Allocate memory on device N. */
3871 GOMP_OFFLOAD_alloc (int n
, size_t size
)
3873 struct agent_info
*agent
= get_agent_info (n
);
3874 return alloc_by_agent (agent
, size
);
3877 /* Free memory from device N. */
3880 GOMP_OFFLOAD_free (int device
, void *ptr
)
3882 GCN_DEBUG ("Freeing memory on device %d\n", device
);
3884 hsa_status_t status
= hsa_fns
.hsa_memory_free_fn (ptr
);
3885 if (status
!= HSA_STATUS_SUCCESS
)
3887 hsa_error ("Could not free device memory", status
);
3891 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3892 bool profiling_dispatch_p
3893 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
3894 if (profiling_dispatch_p
)
3896 acc_prof_info
*prof_info
= thr
->prof_info
;
3897 acc_event_info data_event_info
;
3898 acc_api_info
*api_info
= thr
->api_info
;
3900 prof_info
->event_type
= acc_ev_free
;
3902 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
3903 data_event_info
.data_event
.valid_bytes
3904 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
3905 data_event_info
.data_event
.parent_construct
3906 = acc_construct_parallel
;
3907 data_event_info
.data_event
.implicit
= 1;
3908 data_event_info
.data_event
.tool_info
= NULL
;
3909 data_event_info
.data_event
.var_name
= NULL
;
3910 data_event_info
.data_event
.bytes
= 0;
3911 data_event_info
.data_event
.host_ptr
= NULL
;
3912 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
3914 api_info
->device_api
= acc_device_api_other
;
3916 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
3923 /* Copy data from DEVICE to host. */
3926 GOMP_OFFLOAD_dev2host (int device
, void *dst
, const void *src
, size_t n
)
3928 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n
, device
,
3930 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3931 if (status
!= HSA_STATUS_SUCCESS
)
3932 GOMP_PLUGIN_error ("memory copy failed");
3936 /* Copy data from host to DEVICE. */
3939 GOMP_OFFLOAD_host2dev (int device
, void *dst
, const void *src
, size_t n
)
3941 GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n
, src
,
3943 hsa_memory_copy_wrapper (dst
, src
, n
);
3947 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3950 GOMP_OFFLOAD_dev2dev (int device
, void *dst
, const void *src
, size_t n
)
3952 struct gcn_thread
*thread_data
= gcn_thread ();
3954 if (thread_data
&& !async_synchronous_p (thread_data
->async
))
3956 struct agent_info
*agent
= get_agent_info (device
);
3957 maybe_init_omp_async (agent
);
3958 queue_push_copy (agent
->omp_async_queue
, dst
, src
, n
);
3962 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n
,
3963 device
, src
, device
, dst
);
3964 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3965 if (status
!= HSA_STATUS_SUCCESS
)
3966 GOMP_PLUGIN_error ("memory copy failed");
3970 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
3971 measured in bytes. So we have:
3973 dim1_size: number of bytes to copy on innermost dimension ("row")
3974 dim0_len: number of rows to copy
3975 dst: base pointer for destination of copy
3976 dst_offset1_size: innermost row offset (for dest), in bytes
3977 dst_offset0_len: offset, number of rows (for dest)
3978 dst_dim1_size: whole-array dest row length, in bytes (pitch)
3979 src: base pointer for source of copy
3980 src_offset1_size: innermost row offset (for source), in bytes
3981 src_offset0_len: offset, number of rows (for source)
3982 src_dim1_size: whole-array source row length, in bytes (pitch)
3986 GOMP_OFFLOAD_memcpy2d (int dst_ord
, int src_ord
, size_t dim1_size
,
3987 size_t dim0_len
, void *dst
, size_t dst_offset1_size
,
3988 size_t dst_offset0_len
, size_t dst_dim1_size
,
3989 const void *src
, size_t src_offset1_size
,
3990 size_t src_offset0_len
, size_t src_dim1_size
)
3992 if (!hsa_fns
.hsa_amd_memory_lock_fn
3993 || !hsa_fns
.hsa_amd_memory_unlock_fn
3994 || !hsa_fns
.hsa_amd_memory_async_copy_rect_fn
)
3997 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
3998 out quietly if we have anything oddly-aligned rather than letting the
3999 driver raise an error. */
4000 if ((((uintptr_t) dst
) & 3) != 0 || (((uintptr_t) src
) & 3) != 0)
4003 if ((dst_dim1_size
& 3) != 0 || (src_dim1_size
& 3) != 0)
4006 /* Only handle host to device or device to host transfers here. */
4007 if ((dst_ord
== -1 && src_ord
== -1)
4008 || (dst_ord
!= -1 && src_ord
!= -1))
4011 hsa_amd_copy_direction_t dir
4012 = (src_ord
== -1) ? hsaHostToDevice
: hsaDeviceToHost
;
4013 hsa_agent_t copy_agent
;
4015 /* We need to pin (lock) host memory before we start the transfer. Try to
4016 lock the minimum size necessary, i.e. using partial first/last rows of the
4017 whole array. Something like this:
4021 c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
4022 o | ++#######+++++ <- whole row
4023 l | ++#######+++++ <- "
4024 s v ++#######..... <- last row apart from trailing remainder
4027 We could split very large transfers into several rectangular copies, but
4028 that is unimplemented for now. */
4030 size_t bounded_size_host
, first_elem_offset_host
;
4032 if (dir
== hsaHostToDevice
)
4034 bounded_size_host
= src_dim1_size
* (dim0_len
- 1) + dim1_size
;
4035 first_elem_offset_host
= src_offset0_len
* src_dim1_size
4037 host_ptr
= (void *) src
;
4038 struct agent_info
*agent
= get_agent_info (dst_ord
);
4039 copy_agent
= agent
->id
;
4043 bounded_size_host
= dst_dim1_size
* (dim0_len
- 1) + dim1_size
;
4044 first_elem_offset_host
= dst_offset0_len
* dst_dim1_size
4047 struct agent_info
*agent
= get_agent_info (src_ord
);
4048 copy_agent
= agent
->id
;
4054 = hsa_fns
.hsa_amd_memory_lock_fn (host_ptr
+ first_elem_offset_host
,
4055 bounded_size_host
, NULL
, 0, &agent_ptr
);
4056 /* We can't lock the host memory: don't give up though, we might still be
4057 able to use the slow path in our caller. So, don't make this an
4059 if (status
!= HSA_STATUS_SUCCESS
)
4062 hsa_pitched_ptr_t dstpp
, srcpp
;
4063 hsa_dim3_t dst_offsets
, src_offsets
, ranges
;
4067 hsa_signal_t completion_signal
;
4068 status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &completion_signal
);
4069 if (status
!= HSA_STATUS_SUCCESS
)
4075 if (dir
== hsaHostToDevice
)
4077 srcpp
.base
= agent_ptr
- first_elem_offset_host
;
4082 srcpp
.base
= (void *) src
;
4083 dstpp
.base
= agent_ptr
- first_elem_offset_host
;
4086 srcpp
.pitch
= src_dim1_size
;
4089 src_offsets
.x
= src_offset1_size
;
4090 src_offsets
.y
= src_offset0_len
;
4093 dstpp
.pitch
= dst_dim1_size
;
4096 dst_offsets
.x
= dst_offset1_size
;
4097 dst_offsets
.y
= dst_offset0_len
;
4100 ranges
.x
= dim1_size
;
4101 ranges
.y
= dim0_len
;
4105 = hsa_fns
.hsa_amd_memory_async_copy_rect_fn (&dstpp
, &dst_offsets
, &srcpp
,
4106 &src_offsets
, &ranges
,
4107 copy_agent
, dir
, 0, NULL
,
4109 /* If the rectangular copy fails, we might still be able to use the slow
4110 path. We need to unlock the host memory though, so don't return
4112 if (status
!= HSA_STATUS_SUCCESS
)
4115 hsa_fns
.hsa_signal_wait_acquire_fn (completion_signal
,
4116 HSA_SIGNAL_CONDITION_LT
, 1, UINT64_MAX
,
4117 HSA_WAIT_STATE_ACTIVE
);
4119 hsa_fns
.hsa_signal_destroy_fn (completion_signal
);
4122 status
= hsa_fns
.hsa_amd_memory_unlock_fn (host_ptr
+ first_elem_offset_host
);
4123 if (status
!= HSA_STATUS_SUCCESS
)
4124 hsa_fatal ("Could not unlock host memory", status
);
4129 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4130 measured in bytes. So we have:
4132 dim2_size: number of bytes to copy on innermost dimension ("row")
4133 dim1_len: number of rows per slice to copy
4134 dim0_len: number of slices to copy
4135 dst: base pointer for destination of copy
4136 dst_offset2_size: innermost row offset (for dest), in bytes
4137 dst_offset1_len: offset, number of rows (for dest)
4138 dst_offset0_len: offset, number of slices (for dest)
4139 dst_dim2_size: whole-array dest row length, in bytes (pitch)
4140 dst_dim1_len: whole-array number of rows in slice (for dest)
4141 src: base pointer for source of copy
4142 src_offset2_size: innermost row offset (for source), in bytes
4143 src_offset1_len: offset, number of rows (for source)
4144 src_offset0_len: offset, number of slices (for source)
4145 src_dim2_size: whole-array source row length, in bytes (pitch)
4146 src_dim1_len: whole-array number of rows in slice (for source)
4150 GOMP_OFFLOAD_memcpy3d (int dst_ord
, int src_ord
, size_t dim2_size
,
4151 size_t dim1_len
, size_t dim0_len
, void *dst
,
4152 size_t dst_offset2_size
, size_t dst_offset1_len
,
4153 size_t dst_offset0_len
, size_t dst_dim2_size
,
4154 size_t dst_dim1_len
, const void *src
,
4155 size_t src_offset2_size
, size_t src_offset1_len
,
4156 size_t src_offset0_len
, size_t src_dim2_size
,
4157 size_t src_dim1_len
)
4159 if (!hsa_fns
.hsa_amd_memory_lock_fn
4160 || !hsa_fns
.hsa_amd_memory_unlock_fn
4161 || !hsa_fns
.hsa_amd_memory_async_copy_rect_fn
)
4164 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
4165 out quietly if we have anything oddly-aligned rather than letting the
4166 driver raise an error. */
4167 if ((((uintptr_t) dst
) & 3) != 0 || (((uintptr_t) src
) & 3) != 0)
4170 if ((dst_dim2_size
& 3) != 0 || (src_dim2_size
& 3) != 0)
4173 /* Only handle host to device or device to host transfers here. */
4174 if ((dst_ord
== -1 && src_ord
== -1)
4175 || (dst_ord
!= -1 && src_ord
!= -1))
4178 hsa_amd_copy_direction_t dir
4179 = (src_ord
== -1) ? hsaHostToDevice
: hsaDeviceToHost
;
4180 hsa_agent_t copy_agent
;
4182 /* We need to pin (lock) host memory before we start the transfer. Try to
4183 lock the minimum size necessary, i.e. using partial first/last slices of
4184 the whole 3D array. Something like this:
4186 slice 0: slice 1: slice 2:
4187 __________ __________ __________
4188 ^ /+++++++++/ : /+++++++++/ : / /
4189 column /+++##++++/| | /+++##++++/| | /+++## / # = subarray
4190 / / ##++++/ | |/+++##++++/ | |/+++##++++/ + = area to pin
4191 /_________/ : /_________/ : /_________/
4194 We could split very large transfers into several rectangular copies, but
4195 that is unimplemented for now. */
4197 size_t bounded_size_host
, first_elem_offset_host
;
4199 if (dir
== hsaHostToDevice
)
4201 size_t slice_bytes
= src_dim2_size
* src_dim1_len
;
4202 bounded_size_host
= slice_bytes
* (dim0_len
- 1)
4203 + src_dim2_size
* (dim1_len
- 1)
4205 first_elem_offset_host
= src_offset0_len
* slice_bytes
4206 + src_offset1_len
* src_dim2_size
4208 host_ptr
= (void *) src
;
4209 struct agent_info
*agent
= get_agent_info (dst_ord
);
4210 copy_agent
= agent
->id
;
4214 size_t slice_bytes
= dst_dim2_size
* dst_dim1_len
;
4215 bounded_size_host
= slice_bytes
* (dim0_len
- 1)
4216 + dst_dim2_size
* (dim1_len
- 1)
4218 first_elem_offset_host
= dst_offset0_len
* slice_bytes
4219 + dst_offset1_len
* dst_dim2_size
4222 struct agent_info
*agent
= get_agent_info (src_ord
);
4223 copy_agent
= agent
->id
;
4229 = hsa_fns
.hsa_amd_memory_lock_fn (host_ptr
+ first_elem_offset_host
,
4230 bounded_size_host
, NULL
, 0, &agent_ptr
);
4231 /* We can't lock the host memory: don't give up though, we might still be
4232 able to use the slow path in our caller (maybe even with iterated memcpy2d
4233 calls). So, don't make this an error. */
4234 if (status
!= HSA_STATUS_SUCCESS
)
4237 hsa_pitched_ptr_t dstpp
, srcpp
;
4238 hsa_dim3_t dst_offsets
, src_offsets
, ranges
;
4242 hsa_signal_t completion_signal
;
4243 status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &completion_signal
);
4244 if (status
!= HSA_STATUS_SUCCESS
)
4250 if (dir
== hsaHostToDevice
)
4252 srcpp
.base
= agent_ptr
- first_elem_offset_host
;
4257 srcpp
.base
= (void *) src
;
4258 dstpp
.base
= agent_ptr
- first_elem_offset_host
;
4261 /* Pitch is measured in bytes. */
4262 srcpp
.pitch
= src_dim2_size
;
4263 /* Slice is also measured in bytes (i.e. total per-slice). */
4264 srcpp
.slice
= src_dim2_size
* src_dim1_len
;
4266 src_offsets
.x
= src_offset2_size
;
4267 src_offsets
.y
= src_offset1_len
;
4268 src_offsets
.z
= src_offset0_len
;
4271 dstpp
.pitch
= dst_dim2_size
;
4272 dstpp
.slice
= dst_dim2_size
* dst_dim1_len
;
4274 dst_offsets
.x
= dst_offset2_size
;
4275 dst_offsets
.y
= dst_offset1_len
;
4276 dst_offsets
.z
= dst_offset0_len
;
4278 ranges
.x
= dim2_size
;
4279 ranges
.y
= dim1_len
;
4280 ranges
.z
= dim0_len
;
4283 = hsa_fns
.hsa_amd_memory_async_copy_rect_fn (&dstpp
, &dst_offsets
, &srcpp
,
4284 &src_offsets
, &ranges
,
4285 copy_agent
, dir
, 0, NULL
,
4287 /* If the rectangular copy fails, we might still be able to use the slow
4288 path. We need to unlock the host memory though, so don't return
4290 if (status
!= HSA_STATUS_SUCCESS
)
4294 hsa_signal_value_t sv
4295 = hsa_fns
.hsa_signal_wait_acquire_fn (completion_signal
,
4296 HSA_SIGNAL_CONDITION_LT
, 1,
4298 HSA_WAIT_STATE_ACTIVE
);
4301 GCN_WARNING ("async copy rect failure");
4306 hsa_fns
.hsa_signal_destroy_fn (completion_signal
);
4309 status
= hsa_fns
.hsa_amd_memory_unlock_fn (host_ptr
+ first_elem_offset_host
);
4310 if (status
!= HSA_STATUS_SUCCESS
)
4311 hsa_fatal ("Could not unlock host memory", status
);
4317 /* {{{ OpenMP Plugin API */
4319 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4320 in VARS as a parameter. The kernel is identified by FN_PTR which must point
4321 to a kernel_info structure, and must have previously been loaded to the
4322 specified device. */
4325 GOMP_OFFLOAD_run (int device
, void *fn_ptr
, void *vars
, void **args
)
4327 struct agent_info
*agent
= get_agent_info (device
);
4328 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
4329 struct GOMP_kernel_launch_attributes def
;
4330 struct GOMP_kernel_launch_attributes
*kla
;
4331 assert (agent
== kernel
->agent
);
4333 /* If we get here then the kernel must be OpenMP. */
4334 kernel
->kind
= KIND_OPENMP
;
4336 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
4338 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4341 run_kernel (kernel
, vars
, kla
, NULL
, false);
4344 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
4345 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4346 GOMP_PLUGIN_target_task_completion when it has finished. */
4349 GOMP_OFFLOAD_async_run (int device
, void *tgt_fn
, void *tgt_vars
,
4350 void **args
, void *async_data
)
4352 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4353 struct agent_info
*agent
= get_agent_info (device
);
4354 struct kernel_info
*kernel
= (struct kernel_info
*) tgt_fn
;
4355 struct GOMP_kernel_launch_attributes def
;
4356 struct GOMP_kernel_launch_attributes
*kla
;
4357 assert (agent
== kernel
->agent
);
4359 /* If we get here then the kernel must be OpenMP. */
4360 kernel
->kind
= KIND_OPENMP
;
4362 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
4364 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4368 maybe_init_omp_async (agent
);
4369 queue_push_launch (agent
->omp_async_queue
, kernel
, tgt_vars
, kla
);
4370 queue_push_callback (agent
->omp_async_queue
,
4371 GOMP_PLUGIN_target_task_completion
, async_data
);
4375 /* {{{ OpenACC Plugin API */
4377 /* Run a synchronous OpenACC kernel. The device number is inferred from the
4378 already-loaded KERNEL. */
4381 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr
) (void *),
4382 size_t mapnum
__attribute__((unused
)),
4383 void **hostaddrs
__attribute__((unused
)),
4384 void **devaddrs
, unsigned *dims
,
4385 void *targ_mem_desc
)
4387 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
4389 gcn_exec (kernel
, devaddrs
, dims
, targ_mem_desc
, false, NULL
);
4392 /* Run an asynchronous OpenACC kernel on the specified queue. */
4395 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr
) (void *),
4396 size_t mapnum
__attribute__((unused
)),
4397 void **hostaddrs
__attribute__((unused
)),
4399 unsigned *dims
, void *targ_mem_desc
,
4400 struct goacc_asyncqueue
*aq
)
4402 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
4404 gcn_exec (kernel
, devaddrs
, dims
, targ_mem_desc
, true, aq
);
4407 /* Create a new asynchronous thread and queue for running future kernels. */
4409 struct goacc_asyncqueue
*
4410 GOMP_OFFLOAD_openacc_async_construct (int device
)
4412 struct agent_info
*agent
= get_agent_info (device
);
4414 pthread_mutex_lock (&agent
->async_queues_mutex
);
4416 struct goacc_asyncqueue
*aq
= GOMP_PLUGIN_malloc (sizeof (*aq
));
4417 aq
->agent
= get_agent_info (device
);
4419 aq
->next
= agent
->async_queues
;
4422 aq
->next
->prev
= aq
;
4423 aq
->id
= aq
->next
->id
+ 1;
4427 agent
->async_queues
= aq
;
4429 aq
->queue_first
= 0;
4431 aq
->drain_queue_stop
= 0;
4433 if (pthread_mutex_init (&aq
->mutex
, NULL
))
4435 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
4438 if (pthread_cond_init (&aq
->queue_cond_in
, NULL
))
4440 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4443 if (pthread_cond_init (&aq
->queue_cond_out
, NULL
))
4445 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4449 hsa_status_t status
= hsa_fns
.hsa_queue_create_fn (agent
->id
,
4451 HSA_QUEUE_TYPE_MULTI
,
4452 hsa_queue_callback
, NULL
,
4453 UINT32_MAX
, UINT32_MAX
,
4455 if (status
!= HSA_STATUS_SUCCESS
)
4456 hsa_fatal ("Error creating command queue", status
);
4458 int err
= pthread_create (&aq
->thread_drain_queue
, NULL
, &drain_queue
, aq
);
4460 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4462 GCN_DEBUG ("Async thread %d:%d: created\n", aq
->agent
->device_id
,
4465 pthread_mutex_unlock (&agent
->async_queues_mutex
);
4470 /* Destroy an existing asynchronous thread and queue. Waits for any
4471 currently-running task to complete, but cancels any queued tasks. */
4474 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue
*aq
)
4476 struct agent_info
*agent
= aq
->agent
;
4478 finalize_async_thread (aq
);
4480 pthread_mutex_lock (&agent
->async_queues_mutex
);
4483 if ((err
= pthread_mutex_destroy (&aq
->mutex
)))
4485 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err
);
4488 if (pthread_cond_destroy (&aq
->queue_cond_in
))
4490 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4493 if (pthread_cond_destroy (&aq
->queue_cond_out
))
4495 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4498 hsa_status_t status
= hsa_fns
.hsa_queue_destroy_fn (aq
->hsa_queue
);
4499 if (status
!= HSA_STATUS_SUCCESS
)
4501 hsa_error ("Error destroying command queue", status
);
4506 aq
->prev
->next
= aq
->next
;
4508 aq
->next
->prev
= aq
->prev
;
4509 if (agent
->async_queues
== aq
)
4510 agent
->async_queues
= aq
->next
;
4512 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent
->device_id
, aq
->id
);
4515 pthread_mutex_unlock (&agent
->async_queues_mutex
);
4519 pthread_mutex_unlock (&agent
->async_queues_mutex
);
4523 /* Return true if the specified async queue is currently empty. */
4526 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue
*aq
)
4528 return queue_empty (aq
);
4531 /* Block until the specified queue has executed all its tasks and the
4535 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue
*aq
)
4541 /* Add a serialization point across two async queues. Any new tasks added to
4542 AQ2, after this call, will not run until all tasks on AQ1, at the time
4543 of this call, have completed. */
4546 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue
*aq1
,
4547 struct goacc_asyncqueue
*aq2
)
4549 /* For serialize, stream aq2 waits for aq1 to complete work that has been
4550 scheduled to run on it up to this point. */
4553 struct placeholder
*placeholderp
= queue_push_placeholder (aq1
);
4554 queue_push_asyncwait (aq2
, placeholderp
);
4559 /* Add an opaque callback to the given async queue. */
4562 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue
*aq
,
4563 void (*fn
) (void *), void *data
)
4565 queue_push_callback (aq
, fn
, data
);
4568 /* Queue up an asynchronous data copy from host to DEVICE. */
4571 GOMP_OFFLOAD_openacc_async_host2dev (int device
, void *dst
, const void *src
,
4572 size_t n
, struct goacc_asyncqueue
*aq
)
4574 struct agent_info
*agent
= get_agent_info (device
);
4575 assert (agent
== aq
->agent
);
4576 queue_push_copy (aq
, dst
, src
, n
);
4580 /* Queue up an asynchronous data copy from DEVICE to host. */
4583 GOMP_OFFLOAD_openacc_async_dev2host (int device
, void *dst
, const void *src
,
4584 size_t n
, struct goacc_asyncqueue
*aq
)
4586 struct agent_info
*agent
= get_agent_info (device
);
4587 assert (agent
== aq
->agent
);
4588 queue_push_copy (aq
, dst
, src
, n
);
4592 union goacc_property_value
4593 GOMP_OFFLOAD_openacc_get_property (int device
, enum goacc_property prop
)
4595 struct agent_info
*agent
= get_agent_info (device
);
4597 union goacc_property_value propval
= { .val
= 0 };
4601 case GOACC_PROPERTY_FREE_MEMORY
:
4602 /* Not supported. */
4604 case GOACC_PROPERTY_MEMORY
:
4607 hsa_region_t region
= agent
->data_region
;
4608 hsa_status_t status
=
4609 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SIZE
, &size
);
4610 if (status
== HSA_STATUS_SUCCESS
)
4614 case GOACC_PROPERTY_NAME
:
4615 propval
.ptr
= agent
->name
;
4617 case GOACC_PROPERTY_VENDOR
:
4618 propval
.ptr
= agent
->vendor_name
;
4620 case GOACC_PROPERTY_DRIVER
:
4621 propval
.ptr
= hsa_context
.driver_version_s
;
4628 /* Set up plugin-specific thread-local-data (host-side). */
4631 GOMP_OFFLOAD_openacc_create_thread_data (int ord
__attribute__((unused
)))
4633 struct gcn_thread
*thread_data
4634 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread
));
4636 thread_data
->async
= GOMP_ASYNC_SYNC
;
4638 return (void *) thread_data
;
4641 /* Clean up plugin-specific thread-local-data. */
4644 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)