1 /* Plugin for AMD GCN execution.
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* {{{ Includes and defines */
41 #include <hsa_ext_amd.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
48 #include "oacc-plugin.h"
52 /* These probably won't be in elf.h for a while. */
54 #define R_AMDGPU_NONE 0
55 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57 #define R_AMDGPU_ABS64 3 /* S + A */
58 #define R_AMDGPU_REL32 4 /* S + A - P */
59 #define R_AMDGPU_REL64 5 /* S + A - P */
60 #define R_AMDGPU_ABS32 6 /* S + A */
61 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
66 #define R_AMDGPU_RELATIVE64 13 /* B + A */
69 /* GCN specific definitions for asynchronous queues. */
71 #define ASYNC_QUEUE_SIZE 64
72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
73 #define DEBUG_QUEUES 0
74 #define DEBUG_THREAD_SLEEP 0
75 #define DEBUG_THREAD_SIGNAL 0
78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
80 /* Secure getenv() which returns NULL if running as SUID/SGID. */
81 #ifndef HAVE_SECURE_GETENV
82 #ifdef HAVE___SECURE_GETENV
83 #define secure_getenv __secure_getenv
84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.

   Returns the value of NAME from the environment, or NULL when the process
   is running with elevated privileges (real and effective user or group IDs
   differ), so that environment-driven configuration cannot be abused in
   SUID/SGID contexts.  */

static char *
secure_getenv (const char *name)
{
  /* Only trust the environment when we are not SUID/SGID.  */
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}
102 #define secure_getenv getenv
109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
113 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
117 /* As an HSA runtime is dlopened, following structure defines function
118 pointers utilized by the HSA plug-in. */
120 struct hsa_runtime_fn_info
123 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
124 const char **status_string
);
125 hsa_status_t (*hsa_system_get_info_fn
) (hsa_system_info_t attribute
,
127 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
128 hsa_agent_info_t attribute
,
130 hsa_status_t (*hsa_isa_get_info_fn
)(hsa_isa_t isa
,
131 hsa_isa_info_t attribute
,
134 hsa_status_t (*hsa_init_fn
) (void);
135 hsa_status_t (*hsa_iterate_agents_fn
)
136 (hsa_status_t (*callback
)(hsa_agent_t agent
, void *data
), void *data
);
137 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
138 hsa_region_info_t attribute
,
140 hsa_status_t (*hsa_queue_create_fn
)
141 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
142 void (*callback
)(hsa_status_t status
, hsa_queue_t
*source
, void *data
),
143 void *data
, uint32_t private_segment_size
,
144 uint32_t group_segment_size
, hsa_queue_t
**queue
);
145 hsa_status_t (*hsa_agent_iterate_regions_fn
)
147 hsa_status_t (*callback
)(hsa_region_t region
, void *data
), void *data
);
148 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
149 hsa_status_t (*hsa_executable_create_fn
)
150 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
151 const char *options
, hsa_executable_t
*executable
);
152 hsa_status_t (*hsa_executable_global_variable_define_fn
)
153 (hsa_executable_t executable
, const char *variable_name
, void *address
);
154 hsa_status_t (*hsa_executable_load_code_object_fn
)
155 (hsa_executable_t executable
, hsa_agent_t agent
,
156 hsa_code_object_t code_object
, const char *options
);
157 hsa_status_t (*hsa_executable_freeze_fn
)(hsa_executable_t executable
,
158 const char *options
);
159 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
160 uint32_t num_consumers
,
161 const hsa_agent_t
*consumers
,
162 hsa_signal_t
*signal
);
163 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
165 hsa_status_t (*hsa_memory_assign_agent_fn
) (void *ptr
, hsa_agent_t agent
,
166 hsa_access_permission_t access
);
167 hsa_status_t (*hsa_memory_copy_fn
)(void *dst
, const void *src
, size_t size
);
168 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
169 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
170 hsa_status_t (*hsa_executable_get_symbol_fn
)
171 (hsa_executable_t executable
, const char *module_name
,
172 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
173 hsa_executable_symbol_t
*symbol
);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
175 (hsa_executable_symbol_t executable_symbol
,
176 hsa_executable_symbol_info_t attribute
, void *value
);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn
)
178 (hsa_executable_t executable
,
179 hsa_status_t (*callback
)(hsa_executable_t executable
,
180 hsa_executable_symbol_t symbol
, void *data
),
182 uint64_t (*hsa_queue_add_write_index_release_fn
) (const hsa_queue_t
*queue
,
184 uint64_t (*hsa_queue_load_read_index_acquire_fn
) (const hsa_queue_t
*queue
);
185 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
186 hsa_signal_value_t value
);
187 void (*hsa_signal_store_release_fn
) (hsa_signal_t signal
,
188 hsa_signal_value_t value
);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
190 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
191 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
192 hsa_wait_state_t wait_state_hint
);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn
) (hsa_signal_t signal
);
194 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
196 hsa_status_t (*hsa_code_object_deserialize_fn
)
197 (void *serialized_code_object
, size_t serialized_code_object_size
,
198 const char *options
, hsa_code_object_t
*code_object
);
199 hsa_status_t (*hsa_amd_memory_lock_fn
)
200 (void *host_ptr
, size_t size
, hsa_agent_t
*agents
, int num_agent
,
202 hsa_status_t (*hsa_amd_memory_unlock_fn
) (void *host_ptr
);
203 hsa_status_t (*hsa_amd_memory_async_copy_rect_fn
)
204 (const hsa_pitched_ptr_t
*dst
, const hsa_dim3_t
*dst_offset
,
205 const hsa_pitched_ptr_t
*src
, const hsa_dim3_t
*src_offset
,
206 const hsa_dim3_t
*range
, hsa_agent_t copy_agent
,
207 hsa_amd_copy_direction_t dir
, uint32_t num_dep_signals
,
208 const hsa_signal_t
*dep_signals
, hsa_signal_t completion_signal
);
211 /* Structure describing the run-time and grid properties of an HSA kernel
212 lauch. This needs to match the format passed to GOMP_OFFLOAD_run. */
214 struct GOMP_kernel_launch_attributes
216 /* Number of dimensions the workload has. Maximum number is 3. */
218 /* Size of the grid in the three respective dimensions. */
220 /* Size of work-groups in the respective dimensions. */
224 /* Collection of information needed for a dispatch of a kernel from a
227 struct kernel_dispatch
229 struct agent_info
*agent
;
230 /* Pointer to a command queue associated with a kernel dispatch agent. */
232 /* Pointer to a memory space used for kernel arguments passing. */
233 void *kernarg_address
;
236 /* Synchronization signal used for dispatch synchronization. */
238 /* Private segment size. */
239 uint32_t private_segment_size
;
240 /* Group segment size. */
241 uint32_t group_segment_size
;
244 /* Structure of the kernargs segment, supporting console output.
246 This needs to match the definitions in Newlib, and the expectations
247 in libgomp target code. */
250 struct kernargs_abi abi
;
253 struct output output_data
;
256 /* A queue entry for a future asynchronous launch. */
260 struct kernel_info
*kernel
;
262 struct GOMP_kernel_launch_attributes kla
;
265 /* A queue entry for a future callback. */
273 /* A data struct for the copy_data callback. */
280 struct goacc_asyncqueue
*aq
;
283 /* A queue entry for a placeholder. These correspond to a wait event. */
289 pthread_mutex_t mutex
;
292 /* A queue entry for a wait directive. */
294 struct asyncwait_info
296 struct placeholder
*placeholderp
;
299 /* Encode the type of an entry in an async queue. */
309 /* An entry in an async queue. */
313 enum entry_type type
;
315 struct kernel_launch launch
;
316 struct callback callback
;
317 struct asyncwait_info asyncwait
;
318 struct placeholder placeholder
;
322 /* An async queue header.
324 OpenMP may create one of these.
325 OpenACC may create many. */
327 struct goacc_asyncqueue
329 struct agent_info
*agent
;
330 hsa_queue_t
*hsa_queue
;
332 pthread_t thread_drain_queue
;
333 pthread_mutex_t mutex
;
334 pthread_cond_t queue_cond_in
;
335 pthread_cond_t queue_cond_out
;
336 struct queue_entry queue
[ASYNC_QUEUE_SIZE
];
339 int drain_queue_stop
;
342 struct goacc_asyncqueue
*prev
;
343 struct goacc_asyncqueue
*next
;
346 /* Mkoffload uses this structure to describe a kernel.
348 OpenMP kernel dimensions are passed at runtime.
349 OpenACC kernel dimensions are passed at compile time, here. */
351 struct hsa_kernel_description
354 int oacc_dims
[3]; /* Only present for GCN kernels. */
359 /* Mkoffload uses this structure to describe an offload variable. */
361 struct global_var_info
367 /* Mkoffload uses this structure to describe all the kernels in a
368 loadable module. These are passed the libgomp via static constructors. */
370 struct gcn_image_desc
376 const unsigned kernel_count
;
377 struct hsa_kernel_description
*kernel_infos
;
378 const unsigned ind_func_count
;
379 const unsigned global_variable_count
;
382 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
384 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
387 EF_AMDGPU_MACH_UNSUPPORTED
= -1,
388 EF_AMDGPU_MACH_AMDGCN_GFX803
= 0x02a,
389 EF_AMDGPU_MACH_AMDGCN_GFX900
= 0x02c,
390 EF_AMDGPU_MACH_AMDGCN_GFX906
= 0x02f,
391 EF_AMDGPU_MACH_AMDGCN_GFX908
= 0x030,
392 EF_AMDGPU_MACH_AMDGCN_GFX90a
= 0x03f,
393 EF_AMDGPU_MACH_AMDGCN_GFX90c
= 0x032,
394 EF_AMDGPU_MACH_AMDGCN_GFX1030
= 0x036,
395 EF_AMDGPU_MACH_AMDGCN_GFX1036
= 0x045,
396 EF_AMDGPU_MACH_AMDGCN_GFX1100
= 0x041,
397 EF_AMDGPU_MACH_AMDGCN_GFX1103
= 0x044
400 const static int EF_AMDGPU_MACH_MASK
= 0x000000ff;
401 typedef EF_AMDGPU_MACH gcn_isa
;
403 /* Description of an HSA GPU agent (device) and the program associated with
408 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
410 /* The user-visible device number. */
412 /* Whether the agent has been initialized. The fields below are usable only
416 /* The instruction set architecture of the device. */
418 /* Name of the agent. */
420 /* Name of the vendor of the agent. */
421 char vendor_name
[64];
422 /* Command queues of the agent. */
423 hsa_queue_t
*sync_queue
;
424 struct goacc_asyncqueue
*async_queues
, *omp_async_queue
;
425 pthread_mutex_t async_queues_mutex
;
427 /* The HSA memory region from which to allocate kernel arguments. */
428 hsa_region_t kernarg_region
;
430 /* The HSA memory region from which to allocate device data. */
431 hsa_region_t data_region
;
433 /* Allocated ephemeral memories (team arena and stack space). */
434 struct ephemeral_memories_list
*ephemeral_memories_list
;
435 pthread_mutex_t ephemeral_memories_write_lock
;
437 /* Read-write lock that protects kernels which are running or about to be run
438 from interference with loading and unloading of images. Needs to be
439 locked for reading while a kernel is being run, and for writing if the
440 list of modules is manipulated (and thus the HSA program invalidated). */
441 pthread_rwlock_t module_rwlock
;
443 /* The module associated with this kernel. */
444 struct module_info
*module
;
446 /* Mutex enforcing that only one thread will finalize the HSA program. A
447 thread should have locked agent->module_rwlock for reading before
449 pthread_mutex_t prog_mutex
;
450 /* Flag whether the HSA program that consists of all the modules has been
453 /* HSA executable - the finalized program that is used to locate kernels. */
454 hsa_executable_t executable
;
457 /* Information required to identify, finalize and run any given kernel. */
459 enum offload_kind
{KIND_UNKNOWN
, KIND_OPENMP
, KIND_OPENACC
};
463 /* Name of the kernel, required to locate it within the GCN object-code
466 /* The specific agent the kernel has been or will be finalized for and run
468 struct agent_info
*agent
;
469 /* The specific module where the kernel takes place. */
470 struct module_info
*module
;
471 /* Information provided by mkoffload associated with the kernel. */
472 struct hsa_kernel_description
*description
;
473 /* Mutex enforcing that at most once thread ever initializes a kernel for
474 use. A thread should have locked agent->module_rwlock for reading before
476 pthread_mutex_t init_mutex
;
477 /* Flag indicating whether the kernel has been initialized and all fields
478 below it contain valid data. */
480 /* Flag indicating that the kernel has a problem that blocks an execution. */
481 bool initialization_failed
;
482 /* The object to be put into the dispatch queue. */
484 /* Required size of kernel arguments. */
485 uint32_t kernarg_segment_size
;
486 /* Required size of group segment. */
487 uint32_t group_segment_size
;
488 /* Required size of private segment. */
489 uint32_t private_segment_size
;
490 /* Set up for OpenMP or OpenACC? */
491 enum offload_kind kind
;
494 /* Information about a particular GCN module, its image and kernels. */
498 /* The description with which the program has registered the image. */
499 struct gcn_image_desc
*image_desc
;
500 /* GCN heap allocation. */
502 /* Physical boundaries of the loaded module. */
503 Elf64_Addr phys_address_start
;
504 Elf64_Addr phys_address_end
;
506 bool constructors_run_p
;
507 struct kernel_info
*init_array_func
, *fini_array_func
;
509 /* Number of kernels in this module. */
511 /* An array of kernel_info structures describing each kernel in this
513 struct kernel_info kernels
[];
516 /* A linked list of memory arenas allocated on the device.
517 These are used by OpenMP, as a means to optimize per-team malloc,
518 and for host-accessible stack space. */
520 struct ephemeral_memories_list
522 struct ephemeral_memories_list
*next
;
524 /* The size is determined by the number of teams and threads. */
526 /* The device address allocated memory. */
528 /* A flag to prevent two asynchronous kernels trying to use the same memory.
529 The mutex is locked until the kernel exits. */
530 pthread_mutex_t in_use
;
533 /* Information about the whole HSA environment and all of its agents. */
535 struct hsa_context_info
537 /* Whether the structure has been initialized. */
539 /* Number of usable GPU HSA agents in the system. */
541 /* Array of agent_info structures describing the individual HSA agents. */
542 struct agent_info
*agents
;
543 /* Driver version string. */
544 char driver_version_s
[30];
548 /* {{{ Global variables */
550 /* Information about the whole HSA environment and all of its agents. */
552 static struct hsa_context_info hsa_context
;
554 /* HSA runtime functions that are initialized in init_hsa_context. */
556 static struct hsa_runtime_fn_info hsa_fns
;
558 /* Heap space, allocated target-side, provided for use of newlib malloc.
559 Each module should have it's own heap allocated.
560 Beware that heap usage increases with OpenMP teams. See also arenas. */
562 static size_t gcn_kernel_heap_size
= DEFAULT_GCN_HEAP_SIZE
;
564 /* Ephemeral memory sizes for each kernel launch. */
566 static int team_arena_size
= DEFAULT_TEAM_ARENA_SIZE
;
567 static int stack_size
= DEFAULT_GCN_STACK_SIZE
;
568 static int lowlat_size
= -1;
570 /* Flag to decide whether print to stderr information about what is going on.
571 Set in init_debug depending on environment variables. */
575 /* Flag to decide if the runtime should suppress a possible fallback to host
578 static bool suppress_host_fallback
;
580 /* Flag to locate HSA runtime shared library that is dlopened
583 static const char *hsa_runtime_lib
;
585 /* Flag to decide if the runtime should support also CPU devices (can be
588 static bool support_cpu_devices
;
590 /* Runtime dimension overrides. Zero indicates default. */
592 static int override_x_dim
= 0;
593 static int override_z_dim
= 0;
596 /* {{{ Debug & Diagnostic */
598 /* Print a message to stderr if GCN_DEBUG value is set to true. */
600 #define DEBUG_PRINT(...) \
605 fprintf (stderr, __VA_ARGS__); \
610 /* Flush stderr if GCN_DEBUG value is set to true. */
612 #define DEBUG_FLUSH() \
618 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
621 #define DEBUG_LOG(prefix, ...) \
624 DEBUG_PRINT (prefix); \
625 DEBUG_PRINT (__VA_ARGS__); \
629 /* Print a debugging message to stderr. */
631 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
633 /* Print a warning message to stderr. */
635 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
637 /* Print HSA warning STR with an HSA STATUS code. */
640 hsa_warn (const char *str
, hsa_status_t status
)
645 const char *hsa_error_msg
= "[unknown]";
646 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
648 fprintf (stderr
, "GCN warning: %s\nRuntime message: %s\n", str
,
652 /* Report a fatal error STR together with the HSA error corresponding to STATUS
653 and terminate execution of the current process. */
656 hsa_fatal (const char *str
, hsa_status_t status
)
658 const char *hsa_error_msg
= "[unknown]";
659 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
660 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str
,
664 /* Like hsa_fatal, except only report error message, and return FALSE
665 for propagating error processing to outside of plugin. */
668 hsa_error (const char *str
, hsa_status_t status
)
670 const char *hsa_error_msg
= "[unknown]";
671 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
672 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str
,
677 /* Dump information about the available hardware. */
680 dump_hsa_system_info (void)
684 hsa_endianness_t endianness
;
685 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS
,
687 if (status
== HSA_STATUS_SUCCESS
)
690 case HSA_ENDIANNESS_LITTLE
:
691 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
693 case HSA_ENDIANNESS_BIG
:
694 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
697 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
700 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
702 uint8_t extensions
[128];
703 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS
,
705 if (status
== HSA_STATUS_SUCCESS
)
707 if (extensions
[0] & (1 << HSA_EXTENSION_IMAGES
))
708 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
711 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
714 /* Dump information about the available hardware. */
717 dump_machine_model (hsa_machine_model_t machine_model
, const char *s
)
719 switch (machine_model
)
721 case HSA_MACHINE_MODEL_SMALL
:
722 GCN_DEBUG ("%s: SMALL\n", s
);
724 case HSA_MACHINE_MODEL_LARGE
:
725 GCN_DEBUG ("%s: LARGE\n", s
);
728 GCN_WARNING ("%s: UNKNOWN\n", s
);
733 /* Dump information about the available hardware. */
736 dump_profile (hsa_profile_t profile
, const char *s
)
740 case HSA_PROFILE_FULL
:
741 GCN_DEBUG ("%s: FULL\n", s
);
743 case HSA_PROFILE_BASE
:
744 GCN_DEBUG ("%s: BASE\n", s
);
747 GCN_WARNING ("%s: UNKNOWN\n", s
);
752 /* Dump information about a device memory region. */
755 dump_hsa_region (hsa_region_t region
, void *data
__attribute__((unused
)))
759 hsa_region_segment_t segment
;
760 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
762 if (status
== HSA_STATUS_SUCCESS
)
764 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
765 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
766 else if (segment
== HSA_REGION_SEGMENT_READONLY
)
767 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
768 else if (segment
== HSA_REGION_SEGMENT_PRIVATE
)
769 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
770 else if (segment
== HSA_REGION_SEGMENT_GROUP
)
771 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
773 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
776 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
778 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
782 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
784 if (status
== HSA_STATUS_SUCCESS
)
786 if (flags
& HSA_REGION_GLOBAL_FLAG_KERNARG
)
787 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
788 if (flags
& HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
)
789 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
790 if (flags
& HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
)
791 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
794 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
798 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SIZE
, &size
);
799 if (status
== HSA_STATUS_SUCCESS
)
800 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size
);
802 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
805 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_ALLOC_MAX_SIZE
,
807 if (status
== HSA_STATUS_SUCCESS
)
808 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size
);
810 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
814 = hsa_fns
.hsa_region_get_info_fn (region
,
815 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED
,
817 if (status
== HSA_STATUS_SUCCESS
)
818 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed
);
820 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
822 if (status
!= HSA_STATUS_SUCCESS
|| !alloc_allowed
)
823 return HSA_STATUS_SUCCESS
;
826 = hsa_fns
.hsa_region_get_info_fn (region
,
827 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
,
829 if (status
== HSA_STATUS_SUCCESS
)
830 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size
);
832 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
836 = hsa_fns
.hsa_region_get_info_fn (region
,
837 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT
,
839 if (status
== HSA_STATUS_SUCCESS
)
840 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align
);
842 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
844 return HSA_STATUS_SUCCESS
;
847 /* Dump information about all the device memory regions. */
850 dump_hsa_regions (hsa_agent_t agent
)
853 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
,
856 if (status
!= HSA_STATUS_SUCCESS
)
857 hsa_error ("Dumping hsa regions failed", status
);
860 /* Dump information about the available devices. */
863 dump_hsa_agent_info (hsa_agent_t agent
, void *data
__attribute__((unused
)))
868 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_NAME
,
870 if (status
== HSA_STATUS_SUCCESS
)
871 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf
);
873 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
875 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_VENDOR_NAME
,
877 if (status
== HSA_STATUS_SUCCESS
)
878 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf
);
880 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
882 hsa_machine_model_t machine_model
;
884 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_MACHINE_MODEL
,
886 if (status
== HSA_STATUS_SUCCESS
)
887 dump_machine_model (machine_model
, "HSA_AGENT_INFO_MACHINE_MODEL");
889 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
891 hsa_profile_t profile
;
892 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_PROFILE
,
894 if (status
== HSA_STATUS_SUCCESS
)
895 dump_profile (profile
, "HSA_AGENT_INFO_PROFILE");
897 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
899 hsa_device_type_t device_type
;
900 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
902 if (status
== HSA_STATUS_SUCCESS
)
906 case HSA_DEVICE_TYPE_CPU
:
907 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
909 case HSA_DEVICE_TYPE_GPU
:
910 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
912 case HSA_DEVICE_TYPE_DSP
:
913 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
916 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
921 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
924 status
= hsa_fns
.hsa_agent_get_info_fn
925 (agent
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
926 if (status
== HSA_STATUS_SUCCESS
)
927 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count
);
929 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
932 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_WAVEFRONT_SIZE
,
934 if (status
== HSA_STATUS_SUCCESS
)
935 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size
);
937 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
940 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
941 HSA_AGENT_INFO_WORKGROUP_MAX_DIM
,
943 if (status
== HSA_STATUS_SUCCESS
)
944 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim
);
946 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
949 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
950 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE
,
952 if (status
== HSA_STATUS_SUCCESS
)
953 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size
);
955 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
957 uint32_t grid_max_dim
;
958 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_DIM
,
960 if (status
== HSA_STATUS_SUCCESS
)
961 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim
);
963 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
965 uint32_t grid_max_size
;
966 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_SIZE
,
968 if (status
== HSA_STATUS_SUCCESS
)
969 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size
);
971 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
973 dump_hsa_regions (agent
);
975 return HSA_STATUS_SUCCESS
;
978 /* Forward reference. */
980 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol
);
982 /* Helper function for dump_executable_symbols. */
985 dump_executable_symbol (hsa_executable_t executable
,
986 hsa_executable_symbol_t symbol
,
987 void *data
__attribute__((unused
)))
989 char *name
= get_executable_symbol_name (symbol
);
993 GCN_DEBUG ("executable symbol: %s\n", name
);
997 return HSA_STATUS_SUCCESS
;
1000 /* Dump all global symbol in an executable. */
1003 dump_executable_symbols (hsa_executable_t executable
)
1005 hsa_status_t status
;
1007 = hsa_fns
.hsa_executable_iterate_symbols_fn (executable
,
1008 dump_executable_symbol
,
1010 if (status
!= HSA_STATUS_SUCCESS
)
1011 hsa_fatal ("Could not dump HSA executable symbols", status
);
1014 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1017 print_kernel_dispatch (struct kernel_dispatch
*dispatch
, unsigned indent
)
1019 struct kernargs
*kernargs
= (struct kernargs
*)dispatch
->kernarg_address
;
1021 fprintf (stderr
, "%*sthis: %p\n", indent
, "", dispatch
);
1022 fprintf (stderr
, "%*squeue: %p\n", indent
, "", dispatch
->queue
);
1023 fprintf (stderr
, "%*skernarg_address: %p\n", indent
, "", kernargs
);
1024 fprintf (stderr
, "%*sheap address: %p\n", indent
, "",
1025 (void*)kernargs
->abi
.heap_ptr
);
1026 fprintf (stderr
, "%*sarena address: %p (%d bytes per workgroup)\n", indent
,
1027 "", (void*)kernargs
->abi
.arena_ptr
,
1028 kernargs
->abi
.arena_size_per_team
);
1029 fprintf (stderr
, "%*sstack address: %p (%d bytes per wavefront)\n", indent
,
1030 "", (void*)kernargs
->abi
.stack_ptr
,
1031 kernargs
->abi
.stack_size_per_thread
);
1032 fprintf (stderr
, "%*sobject: %lu\n", indent
, "", dispatch
->object
);
1033 fprintf (stderr
, "%*sprivate_segment_size: %u\n", indent
, "",
1034 dispatch
->private_segment_size
);
1035 fprintf (stderr
, "%*sgroup_segment_size: %u (low-latency pool)\n", indent
,
1036 "", dispatch
->group_segment_size
);
1037 fprintf (stderr
, "\n");
1041 /* {{{ Utility functions */
/* Cast the thread local storage to gcn_thread.  The pointer comes from
   GOMP_PLUGIN_acc_thread (), so it is NULL outside an OpenACC thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}
1051 /* Initialize debug and suppress_host_fallback according to the environment. */
1054 init_environment_variables (void)
1056 if (secure_getenv ("GCN_DEBUG"))
1061 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1062 suppress_host_fallback
= true;
1064 suppress_host_fallback
= false;
1066 hsa_runtime_lib
= secure_getenv ("HSA_RUNTIME_LIB");
1067 if (hsa_runtime_lib
== NULL
)
1068 hsa_runtime_lib
= "libhsa-runtime64.so.1";
1070 support_cpu_devices
= secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1072 const char *x
= secure_getenv ("GCN_NUM_TEAMS");
1074 x
= secure_getenv ("GCN_NUM_GANGS");
1076 override_x_dim
= atoi (x
);
1078 const char *z
= secure_getenv ("GCN_NUM_THREADS");
1080 z
= secure_getenv ("GCN_NUM_WORKERS");
1082 override_z_dim
= atoi (z
);
1084 const char *heap
= secure_getenv ("GCN_HEAP_SIZE");
1087 size_t tmp
= atol (heap
);
1089 gcn_kernel_heap_size
= tmp
;
1092 const char *arena
= secure_getenv ("GCN_TEAM_ARENA_SIZE");
1095 int tmp
= atoi (arena
);
1097 team_arena_size
= tmp
;;
1100 const char *stack
= secure_getenv ("GCN_STACK_SIZE");
1103 int tmp
= atoi (stack
);
1108 const char *lowlat
= secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1110 lowlat_size
= atoi (lowlat
);
1113 /* Return malloc'd string with name of SYMBOL. */
1116 get_executable_symbol_name (hsa_executable_symbol_t symbol
)
1118 hsa_status_t status
;
1121 const hsa_executable_symbol_info_t info_name_length
1122 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
;
1124 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name_length
,
1126 if (status
!= HSA_STATUS_SUCCESS
)
1128 hsa_error ("Could not get length of symbol name", status
);
1132 res
= GOMP_PLUGIN_malloc (len
+ 1);
1134 const hsa_executable_symbol_info_t info_name
1135 = HSA_EXECUTABLE_SYMBOL_INFO_NAME
;
1137 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name
, res
);
1139 if (status
!= HSA_STATUS_SUCCESS
)
1141 hsa_error ("Could not get symbol name", status
);
1151 /* Get the number of GPU Compute Units. */
1154 get_cu_count (struct agent_info
*agent
)
1157 hsa_status_t status
= hsa_fns
.hsa_agent_get_info_fn
1158 (agent
->id
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
1159 if (status
== HSA_STATUS_SUCCESS
)
1162 return 64; /* The usual number for older devices. */
/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  Currently a simple
   clamp; values <= 16 pass through unchanged.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more inteligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}
/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.
   TEAMS values at or below the limit pass through unchanged.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int max_teams = 2 * get_cu_count (agent);
  if (teams > max_teams)
    teams = max_teams;
  return teams;
}
1191 /* Parse the target attributes INPUT provided by the compiler and return true
1192 if we should run anything all. If INPUT is NULL, fill DEF with default
1193 values, then store INPUT or DEF into *RESULT.
1195 This is used for OpenMP only. */
1198 parse_target_attributes (void **input
,
1199 struct GOMP_kernel_launch_attributes
*def
,
1200 struct GOMP_kernel_launch_attributes
**result
,
1201 struct agent_info
*agent
)
1204 GOMP_PLUGIN_fatal ("No target arguments provided");
1206 bool grid_attrs_found
= false;
1207 bool gcn_dims_found
= false;
1209 int gcn_threads
= 0;
1212 intptr_t id
= (intptr_t) *input
++, val
;
1214 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
1215 val
= (intptr_t) *input
++;
1217 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
1219 val
= (val
> INT_MAX
) ? INT_MAX
: val
;
1221 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) == GOMP_DEVICE_GCN
1222 && ((id
& GOMP_TARGET_ARG_ID_MASK
)
1223 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES
))
1225 grid_attrs_found
= true;
1228 else if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
)
1229 == GOMP_TARGET_ARG_DEVICE_ALL
)
1231 gcn_dims_found
= true;
1232 switch (id
& GOMP_TARGET_ARG_ID_MASK
)
1234 case GOMP_TARGET_ARG_NUM_TEAMS
:
1235 gcn_teams
= limit_teams (val
, agent
);
1237 case GOMP_TARGET_ARG_THREAD_LIMIT
:
1238 gcn_threads
= limit_worker_threads (val
);
1248 bool gfx900_workaround_p
= false;
1250 if (agent
->device_isa
== EF_AMDGPU_MACH_AMDGCN_GFX900
1251 && gcn_threads
== 0 && override_z_dim
== 0)
1253 gfx900_workaround_p
= true;
1254 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1255 "threads to at most 4 per team.\n");
1256 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1257 "GCN_NUM_THREADS=16\n");
1260 /* Ideally, when a dimension isn't explicitly specified, we should
1261 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1262 In practice, we tune for peak performance on BabelStream, which
1263 for OpenACC is currently 32 threads per CU. */
1265 if (gcn_teams
<= 0 && gcn_threads
<= 0)
1267 /* Set up a reasonable number of teams and threads. */
1268 gcn_threads
= gfx900_workaround_p
? 4 : 16; // 8;
1269 def
->gdims
[0] = get_cu_count (agent
); // * (40 / gcn_threads);
1270 def
->gdims
[2] = gcn_threads
;
1272 else if (gcn_teams
<= 0 && gcn_threads
> 0)
1274 /* Auto-scale the number of teams with the number of threads. */
1275 def
->gdims
[0] = get_cu_count (agent
); // * (40 / gcn_threads);
1276 def
->gdims
[2] = gcn_threads
;
1278 else if (gcn_teams
> 0 && gcn_threads
<= 0)
1280 int max_threads
= gfx900_workaround_p
? 4 : 16;
1282 /* Auto-scale the number of threads with the number of teams. */
1283 def
->gdims
[0] = gcn_teams
;
1284 def
->gdims
[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1285 if (def
->gdims
[2] == 0)
1287 else if (def
->gdims
[2] > max_threads
)
1288 def
->gdims
[2] = max_threads
;
1292 def
->gdims
[0] = gcn_teams
;
1293 def
->gdims
[2] = gcn_threads
;
1295 def
->gdims
[1] = 64; /* Each thread is 64 work items wide. */
1296 def
->wdims
[0] = 1; /* Single team per work-group. */
1302 else if (!grid_attrs_found
)
1312 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1316 struct GOMP_kernel_launch_attributes
*kla
;
1317 kla
= (struct GOMP_kernel_launch_attributes
*) *input
;
1319 if (kla
->ndim
== 0 || kla
->ndim
> 3)
1320 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla
->ndim
);
1322 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla
->ndim
);
1324 for (i
= 0; i
< kla
->ndim
; i
++)
1326 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i
,
1327 kla
->gdims
[i
], kla
->wdims
[i
]);
1328 if (kla
->gdims
[i
] == 0)
1334 /* Return the group size given the requested GROUP size, GRID size and number
1335 of grid dimensions NDIM. */
1338 get_group_size (uint32_t ndim
, uint32_t grid
, uint32_t group
)
1342 /* TODO: Provide a default via environment or device characteristics. */
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.
   HEADER occupies the low 16 bits and REST the high 16 bits; a single
   release store publishes both fields to the device at once.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
}
1364 /* A never-called callback for the HSA command queues. These signal events
1365 that we don't use, so we trigger an error.
1367 This "queue" is not to be confused with the async queues, below. */
1370 hsa_queue_callback (hsa_status_t status
,
1371 hsa_queue_t
*queue
__attribute__ ((unused
)),
1372 void *data
__attribute__ ((unused
)))
1374 hsa_fatal ("Asynchronous queue error", status
);
1378 /* {{{ HSA initialization */
1380 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1383 init_hsa_runtime_functions (void)
1385 #define DLSYM_FN(function) \
1386 hsa_fns.function##_fn = dlsym (handle, #function); \
1387 if (hsa_fns.function##_fn == NULL) \
1388 GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
1389 #define DLSYM_OPT_FN(function) \
1390 hsa_fns.function##_fn = dlsym (handle, #function);
1392 void *handle
= dlopen (hsa_runtime_lib
, RTLD_LAZY
);
1396 DLSYM_FN (hsa_status_string
)
1397 DLSYM_FN (hsa_system_get_info
)
1398 DLSYM_FN (hsa_agent_get_info
)
1400 DLSYM_FN (hsa_iterate_agents
)
1401 DLSYM_FN (hsa_region_get_info
)
1402 DLSYM_FN (hsa_queue_create
)
1403 DLSYM_FN (hsa_agent_iterate_regions
)
1404 DLSYM_FN (hsa_executable_destroy
)
1405 DLSYM_FN (hsa_executable_create
)
1406 DLSYM_FN (hsa_executable_global_variable_define
)
1407 DLSYM_FN (hsa_executable_load_code_object
)
1408 DLSYM_FN (hsa_executable_freeze
)
1409 DLSYM_FN (hsa_signal_create
)
1410 DLSYM_FN (hsa_memory_allocate
)
1411 DLSYM_FN (hsa_memory_assign_agent
)
1412 DLSYM_FN (hsa_memory_copy
)
1413 DLSYM_FN (hsa_memory_free
)
1414 DLSYM_FN (hsa_signal_destroy
)
1415 DLSYM_FN (hsa_executable_get_symbol
)
1416 DLSYM_FN (hsa_executable_symbol_get_info
)
1417 DLSYM_FN (hsa_executable_iterate_symbols
)
1418 DLSYM_FN (hsa_queue_add_write_index_release
)
1419 DLSYM_FN (hsa_queue_load_read_index_acquire
)
1420 DLSYM_FN (hsa_signal_wait_acquire
)
1421 DLSYM_FN (hsa_signal_store_relaxed
)
1422 DLSYM_FN (hsa_signal_store_release
)
1423 DLSYM_FN (hsa_signal_load_acquire
)
1424 DLSYM_FN (hsa_queue_destroy
)
1425 DLSYM_FN (hsa_code_object_deserialize
)
1426 DLSYM_OPT_FN (hsa_amd_memory_lock
)
1427 DLSYM_OPT_FN (hsa_amd_memory_unlock
)
1428 DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect
)
1434 static gcn_isa
isa_code (const char *isa
);
1436 /* Return true if the agent is a GPU and can accept of concurrent submissions
1437 from different threads. */
1440 suitable_hsa_agent_p (hsa_agent_t agent
)
1442 hsa_device_type_t device_type
;
1444 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
1446 if (status
!= HSA_STATUS_SUCCESS
)
1449 switch (device_type
)
1451 case HSA_DEVICE_TYPE_GPU
:
1455 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_NAME
, name
);
1456 if (status
!= HSA_STATUS_SUCCESS
1457 || isa_code (name
) == EF_AMDGPU_MACH_UNSUPPORTED
)
1459 GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
1460 status
== HSA_STATUS_SUCCESS
? name
: "invalid");
1465 case HSA_DEVICE_TYPE_CPU
:
1466 if (!support_cpu_devices
)
1473 uint32_t features
= 0;
1474 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_FEATURE
,
1476 if (status
!= HSA_STATUS_SUCCESS
1477 || !(features
& HSA_AGENT_FEATURE_KERNEL_DISPATCH
))
1479 hsa_queue_type_t queue_type
;
1480 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_QUEUE_TYPE
,
1482 if (status
!= HSA_STATUS_SUCCESS
1483 || (queue_type
!= HSA_QUEUE_TYPE_MULTI
))
1489 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1490 agent_count in hsa_context. */
1493 count_gpu_agents (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
1495 if (suitable_hsa_agent_p (agent
))
1496 hsa_context
.agent_count
++;
1497 return HSA_STATUS_SUCCESS
;
1500 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1501 id to the describing structure in the hsa context. The index of the
1502 structure is pointed to by DATA, increment it afterwards. */
1505 assign_agent_ids (hsa_agent_t agent
, void *data
)
1507 if (suitable_hsa_agent_p (agent
))
1509 int *agent_index
= (int *) data
;
1510 hsa_context
.agents
[*agent_index
].id
= agent
;
1513 return HSA_STATUS_SUCCESS
;
1516 /* Initialize hsa_context if it has not already been done.
1517 If !PROBE: returns TRUE on success.
1518 If PROBE: returns TRUE on success or if the plugin/device shall be silently
1519 ignored, and otherwise emits an error and returns FALSE. */
1522 init_hsa_context (bool probe
)
1524 hsa_status_t status
;
1525 int agent_index
= 0;
1527 if (hsa_context
.initialized
)
1529 init_environment_variables ();
1530 if (!init_hsa_runtime_functions ())
1532 const char *msg
= "Run-time could not be dynamically opened";
1533 if (suppress_host_fallback
)
1534 GOMP_PLUGIN_fatal ("%s\n", msg
);
1536 GCN_WARNING ("%s\n", msg
);
1537 return probe
? true : false;
1539 status
= hsa_fns
.hsa_init_fn ();
1540 if (status
!= HSA_STATUS_SUCCESS
)
1541 return hsa_error ("Run-time could not be initialized", status
);
1542 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1545 dump_hsa_system_info ();
1547 status
= hsa_fns
.hsa_iterate_agents_fn (count_gpu_agents
, NULL
);
1548 if (status
!= HSA_STATUS_SUCCESS
)
1549 return hsa_error ("GCN GPU devices could not be enumerated", status
);
1550 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context
.agent_count
);
1553 = GOMP_PLUGIN_malloc_cleared (hsa_context
.agent_count
1554 * sizeof (struct agent_info
));
1555 status
= hsa_fns
.hsa_iterate_agents_fn (assign_agent_ids
, &agent_index
);
1556 if (status
!= HSA_STATUS_SUCCESS
)
1557 return hsa_error ("Scanning compute agents failed", status
);
1558 if (agent_index
!= hsa_context
.agent_count
)
1560 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1566 status
= hsa_fns
.hsa_iterate_agents_fn (dump_hsa_agent_info
, NULL
);
1567 if (status
!= HSA_STATUS_SUCCESS
)
1568 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1571 uint16_t minor
, major
;
1572 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR
,
1574 if (status
!= HSA_STATUS_SUCCESS
)
1575 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1576 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR
,
1578 if (status
!= HSA_STATUS_SUCCESS
)
1579 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1581 size_t len
= sizeof hsa_context
.driver_version_s
;
1582 int printed
= snprintf (hsa_context
.driver_version_s
, len
,
1583 "HSA Runtime %hu.%hu", (unsigned short int)major
,
1584 (unsigned short int)minor
);
1586 GCN_WARNING ("HSA runtime version string was truncated."
1587 "Version %hu.%hu is too long.", (unsigned short int)major
,
1588 (unsigned short int)minor
);
1590 hsa_context
.initialized
= true;
1594 /* Verify that hsa_context has already been initialized and return the
1595 agent_info structure describing device number N. Return NULL on error. */
1597 static struct agent_info
*
1598 get_agent_info (int n
)
1600 if (!hsa_context
.initialized
)
1602 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1605 if (n
>= hsa_context
.agent_count
)
1607 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n
);
1610 if (!hsa_context
.agents
[n
].initialized
)
1612 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1615 return &hsa_context
.agents
[n
];
1618 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1620 Selects (breaks at) a suitable region of type KIND. */
1623 get_memory_region (hsa_region_t region
, hsa_region_t
*retval
,
1624 hsa_region_global_flag_t kind
)
1626 hsa_status_t status
;
1627 hsa_region_segment_t segment
;
1629 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
1631 if (status
!= HSA_STATUS_SUCCESS
)
1633 if (segment
!= HSA_REGION_SEGMENT_GLOBAL
)
1634 return HSA_STATUS_SUCCESS
;
1637 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
1639 if (status
!= HSA_STATUS_SUCCESS
)
1644 return HSA_STATUS_INFO_BREAK
;
1646 return HSA_STATUS_SUCCESS
;
1649 /* Callback of hsa_agent_iterate_regions.
1651 Selects a kernargs memory region. */
1654 get_kernarg_memory_region (hsa_region_t region
, void *data
)
1656 return get_memory_region (region
, (hsa_region_t
*)data
,
1657 HSA_REGION_GLOBAL_FLAG_KERNARG
);
1660 /* Callback of hsa_agent_iterate_regions.
1662 Selects a coarse-grained memory region suitable for the heap and
1666 get_data_memory_region (hsa_region_t region
, void *data
)
1668 return get_memory_region (region
, (hsa_region_t
*)data
,
1669 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
);
1673 elf_gcn_isa_field (Elf64_Ehdr
*image
)
1675 return image
->e_flags
& EF_AMDGPU_MACH_MASK
;
/* ISA names as reported by the HSA runtime; compared by isa_code using the
   common "gfxNNN" prefix length below.  */

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static char *gcn_gfx90c_s = "gfx90c";
const static char *gcn_gfx1030_s = "gfx1030";
const static char *gcn_gfx1036_s = "gfx1036";
const static char *gcn_gfx1100_s = "gfx1100";
const static char *gcn_gfx1103_s = "gfx1103";
const static int gcn_isa_name_len = 7;
1690 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1694 isa_hsa_name (int isa
) {
1697 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1698 return gcn_gfx803_s
;
1699 case EF_AMDGPU_MACH_AMDGCN_GFX900
:
1700 return gcn_gfx900_s
;
1701 case EF_AMDGPU_MACH_AMDGCN_GFX906
:
1702 return gcn_gfx906_s
;
1703 case EF_AMDGPU_MACH_AMDGCN_GFX908
:
1704 return gcn_gfx908_s
;
1705 case EF_AMDGPU_MACH_AMDGCN_GFX90a
:
1706 return gcn_gfx90a_s
;
1707 case EF_AMDGPU_MACH_AMDGCN_GFX90c
:
1708 return gcn_gfx90c_s
;
1709 case EF_AMDGPU_MACH_AMDGCN_GFX1030
:
1710 return gcn_gfx1030_s
;
1711 case EF_AMDGPU_MACH_AMDGCN_GFX1036
:
1712 return gcn_gfx1036_s
;
1713 case EF_AMDGPU_MACH_AMDGCN_GFX1100
:
1714 return gcn_gfx1100_s
;
1715 case EF_AMDGPU_MACH_AMDGCN_GFX1103
:
1716 return gcn_gfx1103_s
;
1721 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1722 with -march) or NULL if we do not support the ISA.
1723 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1726 isa_gcc_name (int isa
) {
1729 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1732 return isa_hsa_name (isa
);
1736 /* Returns the code which is used in the GCN object code to identify the ISA with
1737 the given name (as used by the HSA runtime). */
1740 isa_code(const char *isa
) {
1741 if (!strncmp (isa
, gcn_gfx803_s
, gcn_isa_name_len
))
1742 return EF_AMDGPU_MACH_AMDGCN_GFX803
;
1744 if (!strncmp (isa
, gcn_gfx900_s
, gcn_isa_name_len
))
1745 return EF_AMDGPU_MACH_AMDGCN_GFX900
;
1747 if (!strncmp (isa
, gcn_gfx906_s
, gcn_isa_name_len
))
1748 return EF_AMDGPU_MACH_AMDGCN_GFX906
;
1750 if (!strncmp (isa
, gcn_gfx908_s
, gcn_isa_name_len
))
1751 return EF_AMDGPU_MACH_AMDGCN_GFX908
;
1753 if (!strncmp (isa
, gcn_gfx90a_s
, gcn_isa_name_len
))
1754 return EF_AMDGPU_MACH_AMDGCN_GFX90a
;
1756 if (!strncmp (isa
, gcn_gfx90c_s
, gcn_isa_name_len
))
1757 return EF_AMDGPU_MACH_AMDGCN_GFX90c
;
1759 if (!strncmp (isa
, gcn_gfx1030_s
, gcn_isa_name_len
))
1760 return EF_AMDGPU_MACH_AMDGCN_GFX1030
;
1762 if (!strncmp (isa
, gcn_gfx1036_s
, gcn_isa_name_len
))
1763 return EF_AMDGPU_MACH_AMDGCN_GFX1036
;
1765 if (!strncmp (isa
, gcn_gfx1100_s
, gcn_isa_name_len
))
1766 return EF_AMDGPU_MACH_AMDGCN_GFX1100
;
1768 if (!strncmp (isa
, gcn_gfx1103_s
, gcn_isa_name_len
))
1769 return EF_AMDGPU_MACH_AMDGCN_GFX1103
;
1771 return EF_AMDGPU_MACH_UNSUPPORTED
;
1774 /* CDNA2 devices have twice as many VGPRs compared to older devices. */
1777 max_isa_vgprs (int isa
)
1781 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1782 case EF_AMDGPU_MACH_AMDGCN_GFX900
:
1783 case EF_AMDGPU_MACH_AMDGCN_GFX906
:
1784 case EF_AMDGPU_MACH_AMDGCN_GFX908
:
1786 case EF_AMDGPU_MACH_AMDGCN_GFX90a
:
1788 case EF_AMDGPU_MACH_AMDGCN_GFX90c
:
1790 case EF_AMDGPU_MACH_AMDGCN_GFX1030
:
1791 case EF_AMDGPU_MACH_AMDGCN_GFX1036
:
1792 return 512; /* 512 SIMD32 = 256 wavefrontsize64. */
1793 case EF_AMDGPU_MACH_AMDGCN_GFX1100
:
1794 case EF_AMDGPU_MACH_AMDGCN_GFX1103
:
1795 return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */
1797 GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1803 /* Create or reuse a team arena and stack space.
1805 Team arenas are used by OpenMP to avoid calling malloc multiple times
1806 while setting up each team. This is purely a performance optimization.
1808 The stack space is used by all kernels. We must allocate it in such a
1809 way that the reverse offload implmentation can access the data.
1811 Allocating this memory costs performance, so this function will reuse an
1812 existing allocation if a large enough one is idle.
1813 The memory lock is released, but not deallocated, when the kernel exits. */
1816 configure_ephemeral_memories (struct kernel_info
*kernel
,
1817 struct kernargs_abi
*kernargs
, int num_teams
,
1820 struct agent_info
*agent
= kernel
->agent
;
1821 struct ephemeral_memories_list
**next_ptr
= &agent
->ephemeral_memories_list
;
1822 struct ephemeral_memories_list
*item
;
1824 int actual_arena_size
= (kernel
->kind
== KIND_OPENMP
1825 ? team_arena_size
: 0);
1826 int actual_arena_total_size
= actual_arena_size
* num_teams
;
1827 size_t size
= (actual_arena_total_size
1828 + num_teams
* num_threads
* stack_size
);
1830 for (item
= *next_ptr
; item
; next_ptr
= &item
->next
, item
= item
->next
)
1832 if (item
->size
< size
)
1835 if (pthread_mutex_trylock (&item
->in_use
) == 0)
1841 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1842 " (%zd bytes)\n", (actual_arena_size
? "arena and " : ""),
1843 num_teams
, num_threads
, size
);
1845 if (pthread_mutex_lock (&agent
->ephemeral_memories_write_lock
))
1847 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1850 item
= malloc (sizeof (*item
));
1855 if (pthread_mutex_init (&item
->in_use
, NULL
))
1857 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1860 if (pthread_mutex_lock (&item
->in_use
))
1862 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1865 if (pthread_mutex_unlock (&agent
->ephemeral_memories_write_lock
))
1867 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1871 hsa_status_t status
;
1872 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
, size
,
1874 if (status
!= HSA_STATUS_SUCCESS
)
1875 hsa_fatal ("Could not allocate memory for GCN kernel arena", status
);
1876 status
= hsa_fns
.hsa_memory_assign_agent_fn (item
->address
, agent
->id
,
1877 HSA_ACCESS_PERMISSION_RW
);
1878 if (status
!= HSA_STATUS_SUCCESS
)
1879 hsa_fatal ("Could not assign arena & stack memory to device", status
);
1882 kernargs
->arena_ptr
= (actual_arena_total_size
1883 ? (uint64_t)item
->address
1885 kernargs
->stack_ptr
= (uint64_t)item
->address
+ actual_arena_total_size
;
1886 kernargs
->arena_size_per_team
= actual_arena_size
;
1887 kernargs
->stack_size_per_thread
= stack_size
;
1890 /* Mark an ephemeral memory space available for reuse. */
1893 release_ephemeral_memories (struct agent_info
* agent
, void *address
)
1895 struct ephemeral_memories_list
*item
;
1897 for (item
= agent
->ephemeral_memories_list
; item
; item
= item
->next
)
1899 if (item
->address
== address
)
1901 if (pthread_mutex_unlock (&item
->in_use
))
1902 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1906 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1909 /* Clean up all the allocated team arenas. */
1912 destroy_ephemeral_memories (struct agent_info
*agent
)
1914 struct ephemeral_memories_list
*item
, *next
;
1916 for (item
= agent
->ephemeral_memories_list
; item
; item
= next
)
1919 hsa_fns
.hsa_memory_free_fn (item
->address
);
1920 if (pthread_mutex_destroy (&item
->in_use
))
1922 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1927 agent
->ephemeral_memories_list
= NULL
;
1932 /* Allocate memory on a specified device. */
1935 alloc_by_agent (struct agent_info
*agent
, size_t size
)
1937 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size
, agent
->device_id
);
1940 hsa_status_t status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
1942 if (status
!= HSA_STATUS_SUCCESS
)
1944 hsa_error ("Could not allocate device memory", status
);
1948 status
= hsa_fns
.hsa_memory_assign_agent_fn (ptr
, agent
->id
,
1949 HSA_ACCESS_PERMISSION_RW
);
1950 if (status
!= HSA_STATUS_SUCCESS
)
1952 hsa_error ("Could not assign data memory to device", status
);
1956 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1957 bool profiling_dispatch_p
1958 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1959 if (profiling_dispatch_p
)
1961 acc_prof_info
*prof_info
= thr
->prof_info
;
1962 acc_event_info data_event_info
;
1963 acc_api_info
*api_info
= thr
->api_info
;
1965 prof_info
->event_type
= acc_ev_alloc
;
1967 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1968 data_event_info
.data_event
.valid_bytes
1969 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1970 data_event_info
.data_event
.parent_construct
1971 = acc_construct_parallel
;
1972 data_event_info
.data_event
.implicit
= 1;
1973 data_event_info
.data_event
.tool_info
= NULL
;
1974 data_event_info
.data_event
.var_name
= NULL
;
1975 data_event_info
.data_event
.bytes
= size
;
1976 data_event_info
.data_event
.host_ptr
= NULL
;
1977 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
1979 api_info
->device_api
= acc_device_api_other
;
1981 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1988 /* Create kernel dispatch data structure for given KERNEL, along with
1989 the necessary device signals and memory allocations. */
1991 static struct kernel_dispatch
*
1992 create_kernel_dispatch (struct kernel_info
*kernel
, int num_teams
,
1995 struct agent_info
*agent
= kernel
->agent
;
1996 struct kernel_dispatch
*shadow
1997 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch
));
1999 shadow
->agent
= kernel
->agent
;
2000 shadow
->object
= kernel
->object
;
2002 hsa_signal_t sync_signal
;
2003 hsa_status_t status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &sync_signal
);
2004 if (status
!= HSA_STATUS_SUCCESS
)
2005 hsa_fatal ("Error creating the GCN sync signal", status
);
2007 shadow
->signal
= sync_signal
.handle
;
2008 shadow
->private_segment_size
= kernel
->private_segment_size
;
2010 if (lowlat_size
< 0)
2012 /* Divide the LDS between the number of running teams.
2013 Allocate not less than is defined in the kernel metadata. */
2014 int teams_per_cu
= num_teams
/ get_cu_count (agent
);
2015 int LDS_per_team
= (teams_per_cu
? 65536 / teams_per_cu
: 65536);
2016 shadow
->group_segment_size
2017 = (kernel
->group_segment_size
> LDS_per_team
2018 ? kernel
->group_segment_size
2021 else if (lowlat_size
< GCN_LOWLAT_HEAP
+8)
2022 /* Ensure that there's space for the OpenMP libgomp data. */
2023 shadow
->group_segment_size
= GCN_LOWLAT_HEAP
+8;
2025 shadow
->group_segment_size
= (lowlat_size
> 65536
2029 /* We expect kernels to request a single pointer, explicitly, and the
2030 rest of struct kernargs, implicitly. If they request anything else
2031 then something is wrong. */
2032 if (kernel
->kernarg_segment_size
> 8)
2034 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
2038 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->kernarg_region
,
2039 sizeof (struct kernargs
),
2040 &shadow
->kernarg_address
);
2041 if (status
!= HSA_STATUS_SUCCESS
)
2042 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status
);
2043 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2045 /* Zero-initialize the output_data (minimum needed). */
2046 kernargs
->abi
.out_ptr
= (int64_t)&kernargs
->output_data
;
2047 kernargs
->output_data
.next_output
= 0;
2048 for (unsigned i
= 0;
2049 i
< (sizeof (kernargs
->output_data
.queue
)
2050 / sizeof (kernargs
->output_data
.queue
[0]));
2052 kernargs
->output_data
.queue
[i
].written
= 0;
2053 kernargs
->output_data
.consumed
= 0;
2055 /* Pass in the heap location. */
2056 kernargs
->abi
.heap_ptr
= (int64_t)kernel
->module
->heap
;
2058 /* Create the ephemeral memory spaces. */
2059 configure_ephemeral_memories (kernel
, &kernargs
->abi
, num_teams
, num_threads
);
2061 /* Ensure we can recognize unset return values. */
2062 kernargs
->output_data
.return_value
= 0xcafe0000;
2068 process_reverse_offload (uint64_t fn
, uint64_t mapnum
, uint64_t hostaddrs
,
2069 uint64_t sizes
, uint64_t kinds
, uint64_t dev_num64
)
2071 int dev_num
= dev_num64
;
2072 GOMP_PLUGIN_target_rev (fn
, mapnum
, hostaddrs
, sizes
, kinds
, dev_num
,
2076 /* Output any data written to console output from the kernel. It is expected
2077 that this function is polled during kernel execution.
2079 We print all entries from the last item printed to the next entry without
2080 a "written" flag. If the "final" flag is set then it'll continue right to
2083 The print buffer is circular, but the from and to locations don't wrap when
2084 the buffer does, so the output limit is UINT_MAX. The target blocks on
2085 output when the buffer is full. */
2088 console_output (struct kernel_info
*kernel
, struct kernargs
*kernargs
,
2091 unsigned int limit
= (sizeof (kernargs
->output_data
.queue
)
2092 / sizeof (kernargs
->output_data
.queue
[0]));
2094 unsigned int from
= __atomic_load_n (&kernargs
->output_data
.consumed
,
2096 unsigned int to
= kernargs
->output_data
.next_output
;
2102 printf ("GCN print buffer overflowed.\n");
2107 for (i
= from
; i
< to
; i
++)
2109 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
%limit
];
2111 if (!data
->written
&& !final
)
2116 case 0: printf ("%.128s%ld\n", data
->msg
, data
->ivalue
); break;
2117 case 1: printf ("%.128s%f\n", data
->msg
, data
->dvalue
); break;
2118 case 2: printf ("%.128s%.128s\n", data
->msg
, data
->text
); break;
2119 case 3: printf ("%.128s%.128s", data
->msg
, data
->text
); break;
2121 process_reverse_offload (data
->value_u64
[0], data
->value_u64
[1],
2122 data
->value_u64
[2], data
->value_u64
[3],
2123 data
->value_u64
[4], data
->value_u64
[5]);
2125 default: printf ("GCN print buffer error!\n"); break;
2128 __atomic_store_n (&kernargs
->output_data
.consumed
, i
+1,
2134 /* Release data structure created for a kernel dispatch in SHADOW argument,
2135 and clean up the signal and memory allocations. */
2138 release_kernel_dispatch (struct kernel_dispatch
*shadow
)
2140 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow
);
2142 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2143 void *addr
= (void *)kernargs
->abi
.arena_ptr
;
2145 addr
= (void *)kernargs
->abi
.stack_ptr
;
2146 release_ephemeral_memories (shadow
->agent
, addr
);
2148 hsa_fns
.hsa_memory_free_fn (shadow
->kernarg_address
);
2151 s
.handle
= shadow
->signal
;
2152 hsa_fns
.hsa_signal_destroy_fn (s
);
2157 /* Extract the properties from a kernel binary. */
2160 init_kernel_properties (struct kernel_info
*kernel
)
2162 hsa_status_t status
;
2163 struct agent_info
*agent
= kernel
->agent
;
2164 hsa_executable_symbol_t kernel_symbol
;
2165 char *buf
= alloca (strlen (kernel
->name
) + 4);
2166 sprintf (buf
, "%s.kd", kernel
->name
);
2167 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
2170 if (status
!= HSA_STATUS_SUCCESS
)
2172 hsa_warn ("Could not find symbol for kernel in the code object", status
);
2173 fprintf (stderr
, "not found name: '%s'\n", buf
);
2174 dump_executable_symbols (agent
->executable
);
2177 GCN_DEBUG ("Located kernel %s\n", kernel
->name
);
2178 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2179 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &kernel
->object
);
2180 if (status
!= HSA_STATUS_SUCCESS
)
2181 hsa_fatal ("Could not extract a kernel object from its symbol", status
);
2182 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2183 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
2184 &kernel
->kernarg_segment_size
);
2185 if (status
!= HSA_STATUS_SUCCESS
)
2186 hsa_fatal ("Could not get info about kernel argument size", status
);
2187 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2188 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
2189 &kernel
->group_segment_size
);
2190 if (status
!= HSA_STATUS_SUCCESS
)
2191 hsa_fatal ("Could not get info about kernel group segment size", status
);
2192 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2193 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
2194 &kernel
->private_segment_size
);
2195 if (status
!= HSA_STATUS_SUCCESS
)
2196 hsa_fatal ("Could not get info about kernel private segment size",
2199 /* The kernel type is not known until something tries to launch it. */
2200 kernel
->kind
= KIND_UNKNOWN
;
2202 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2203 "following segment sizes: \n", kernel
->name
);
2204 GCN_DEBUG (" group_segment_size: %u\n",
2205 (unsigned) kernel
->group_segment_size
);
2206 GCN_DEBUG (" private_segment_size: %u\n",
2207 (unsigned) kernel
->private_segment_size
);
2208 GCN_DEBUG (" kernarg_segment_size: %u\n",
2209 (unsigned) kernel
->kernarg_segment_size
);
2213 kernel
->initialization_failed
= true;
2216 /* Do all the work that is necessary before running KERNEL for the first time.
2217 The function assumes the program has been created, finalized and frozen by
2218 create_and_finalize_hsa_program. */
2221 init_kernel (struct kernel_info
*kernel
)
2223 if (pthread_mutex_lock (&kernel
->init_mutex
))
2224 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2225 if (kernel
->initialized
)
2227 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2228 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2234 init_kernel_properties (kernel
);
2236 if (!kernel
->initialization_failed
)
2240 kernel
->initialized
= true;
2242 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2243 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2247 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2248 launch attributes from KLA.
2250 MODULE_LOCKED indicates that the caller already holds the lock and
2251 run_kernel need not lock it again.
2252 If AQ is NULL then agent->sync_queue will be used. */
2255 run_kernel (struct kernel_info
*kernel
, void *vars
,
2256 struct GOMP_kernel_launch_attributes
*kla
,
2257 struct goacc_asyncqueue
*aq
, bool module_locked
)
2259 struct agent_info
*agent
= kernel
->agent
;
2260 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel
->description
->sgpr_count
,
2261 kernel
->description
->vpgr_count
);
2263 /* Reduce the number of threads/workers if there are insufficient
2264 VGPRs available to run the kernels together. */
2265 if (kla
->ndim
== 3 && kernel
->description
->vpgr_count
> 0)
2267 int max_vgprs
= max_isa_vgprs (agent
->device_isa
);
2268 int granulated_vgprs
= (kernel
->description
->vpgr_count
+ 3) & ~3;
2269 int max_threads
= (max_vgprs
/ granulated_vgprs
) * 4;
2270 if (kla
->gdims
[2] > max_threads
)
2272 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2273 " per team/gang - reducing to %d threads/workers.\n",
2274 kla
->gdims
[2], max_threads
);
2275 kla
->gdims
[2] = max_threads
;
2279 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel
->agent
->device_id
,
2281 GCN_DEBUG ("GCN launch attribs: gdims:[");
2283 for (i
= 0; i
< kla
->ndim
; ++i
)
2287 DEBUG_PRINT ("%u", kla
->gdims
[i
]);
2289 DEBUG_PRINT ("], normalized gdims:[");
2290 for (i
= 0; i
< kla
->ndim
; ++i
)
2294 DEBUG_PRINT ("%u", kla
->gdims
[i
] / kla
->wdims
[i
]);
2296 DEBUG_PRINT ("], wdims:[");
2297 for (i
= 0; i
< kla
->ndim
; ++i
)
2301 DEBUG_PRINT ("%u", kla
->wdims
[i
]);
2303 DEBUG_PRINT ("]\n");
2306 if (!module_locked
&& pthread_rwlock_rdlock (&agent
->module_rwlock
))
2307 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2309 if (!agent
->initialized
)
2310 GOMP_PLUGIN_fatal ("Agent must be initialized");
2312 if (!kernel
->initialized
)
2313 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2315 hsa_queue_t
*command_q
= (aq
? aq
->hsa_queue
: kernel
->agent
->sync_queue
);
2318 = hsa_fns
.hsa_queue_add_write_index_release_fn (command_q
, 1);
2319 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index
);
2321 /* Wait until the queue is not full before writing the packet. */
2322 while (index
- hsa_fns
.hsa_queue_load_read_index_acquire_fn (command_q
)
2326 /* Do not allow the dimensions to be overridden when running
2327 constructors or destructors. */
2328 int override_x
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_x_dim
;
2329 int override_z
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_z_dim
;
2331 hsa_kernel_dispatch_packet_t
*packet
;
2332 packet
= ((hsa_kernel_dispatch_packet_t
*) command_q
->base_address
)
2333 + index
% command_q
->size
;
2335 memset (((uint8_t *) packet
) + 4, 0, sizeof (*packet
) - 4);
2336 packet
->grid_size_x
= override_x
? : kla
->gdims
[0];
2337 packet
->workgroup_size_x
= get_group_size (kla
->ndim
,
2338 packet
->grid_size_x
,
2343 packet
->grid_size_y
= kla
->gdims
[1];
2344 packet
->workgroup_size_y
= get_group_size (kla
->ndim
, kla
->gdims
[1],
2349 packet
->grid_size_y
= 1;
2350 packet
->workgroup_size_y
= 1;
2355 packet
->grid_size_z
= limit_worker_threads (override_z
2357 packet
->workgroup_size_z
= get_group_size (kla
->ndim
,
2358 packet
->grid_size_z
,
2363 packet
->grid_size_z
= 1;
2364 packet
->workgroup_size_z
= 1;
2367 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2368 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2369 packet
->grid_size_x
, packet
->grid_size_y
, packet
->grid_size_z
,
2370 packet
->grid_size_x
/ packet
->workgroup_size_x
,
2371 packet
->grid_size_y
/ packet
->workgroup_size_y
,
2372 packet
->grid_size_z
/ packet
->workgroup_size_z
,
2373 packet
->workgroup_size_x
, packet
->workgroup_size_y
,
2374 packet
->workgroup_size_z
);
2376 struct kernel_dispatch
*shadow
2377 = create_kernel_dispatch (kernel
, packet
->grid_size_x
,
2378 packet
->grid_size_z
);
2379 shadow
->queue
= command_q
;
2383 fprintf (stderr
, "\nKernel has following dependencies:\n");
2384 print_kernel_dispatch (shadow
, 2);
2387 packet
->private_segment_size
= shadow
->private_segment_size
;
2388 packet
->group_segment_size
= shadow
->group_segment_size
;
2389 packet
->kernel_object
= shadow
->object
;
2390 packet
->kernarg_address
= shadow
->kernarg_address
;
2392 s
.handle
= shadow
->signal
;
2393 packet
->completion_signal
= s
;
2394 hsa_fns
.hsa_signal_store_relaxed_fn (s
, 1);
2395 memcpy (shadow
->kernarg_address
, &vars
, sizeof (vars
));
2397 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2400 header
= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
2401 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
2402 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
2404 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel
->name
,
2407 packet_store_release ((uint32_t *) packet
, header
,
2408 (uint16_t) kla
->ndim
2409 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
);
2411 hsa_fns
.hsa_signal_store_release_fn (command_q
->doorbell_signal
,
2414 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2416 /* Root signal waits with 1ms timeout. */
2417 while (hsa_fns
.hsa_signal_wait_acquire_fn (s
, HSA_SIGNAL_CONDITION_LT
, 1,
2419 HSA_WAIT_STATE_BLOCKED
) != 0)
2421 console_output (kernel
, shadow
->kernarg_address
, false);
2423 console_output (kernel
, shadow
->kernarg_address
, true);
2425 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2426 unsigned int return_value
= (unsigned int)kernargs
->output_data
.return_value
;
2428 release_kernel_dispatch (shadow
);
2430 if (!module_locked
&& pthread_rwlock_unlock (&agent
->module_rwlock
))
2431 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2433 unsigned int upper
= (return_value
& ~0xffff) >> 16;
2434 if (upper
== 0xcafe)
2435 ; // exit not called, normal termination.
2436 else if (upper
== 0xffff)
2440 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2441 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2446 if (upper
== 0xffff)
2448 unsigned int signal
= (return_value
>> 8) & 0xff;
2450 if (signal
== SIGABRT
)
2452 GCN_WARNING ("GCN Kernel aborted\n");
2455 else if (signal
!= 0)
2457 GCN_WARNING ("GCN Kernel received unknown signal\n");
2461 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value
& 0xff);
2462 exit (return_value
& 0xff);
2467 /* {{{ Load/Unload */
2469 /* Initialize KERNEL from D and other parameters. Return true on success. */
2472 init_basic_kernel_info (struct kernel_info
*kernel
,
2473 struct hsa_kernel_description
*d
,
2474 struct agent_info
*agent
,
2475 struct module_info
*module
)
2477 kernel
->agent
= agent
;
2478 kernel
->module
= module
;
2479 kernel
->name
= d
->name
;
2480 kernel
->description
= d
;
2481 if (pthread_mutex_init (&kernel
->init_mutex
, NULL
))
2483 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2489 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2492 isa_matches_agent (struct agent_info
*agent
, Elf64_Ehdr
*image
)
2494 int isa_field
= elf_gcn_isa_field (image
);
2495 const char* isa_s
= isa_hsa_name (isa_field
);
2498 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR
);
2502 if (isa_field
!= agent
->device_isa
)
2505 const char *agent_isa_s
= isa_hsa_name (agent
->device_isa
);
2506 const char *agent_isa_gcc_s
= isa_gcc_name (agent
->device_isa
);
2507 assert (agent_isa_s
);
2508 assert (agent_isa_gcc_s
);
2510 snprintf (msg
, sizeof msg
,
2511 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2512 "Try to recompile with '-foffload-options=-march=%s'.\n",
2513 isa_s
, agent_isa_s
, agent_isa_gcc_s
);
2515 hsa_error (msg
, HSA_STATUS_ERROR
);
2522 /* Create and finalize the program consisting of all loaded modules. */
2525 create_and_finalize_hsa_program (struct agent_info
*agent
)
2527 hsa_status_t status
;
2529 if (pthread_mutex_lock (&agent
->prog_mutex
))
2531 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2534 if (agent
->prog_finalized
)
2538 = hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
2539 HSA_EXECUTABLE_STATE_UNFROZEN
,
2540 "", &agent
->executable
);
2541 if (status
!= HSA_STATUS_SUCCESS
)
2543 hsa_error ("Could not create GCN executable", status
);
2547 /* Load any GCN modules. */
2548 struct module_info
*module
= agent
->module
;
2551 Elf64_Ehdr
*image
= (Elf64_Ehdr
*)module
->image_desc
->gcn_image
->image
;
2553 if (!isa_matches_agent (agent
, image
))
2556 hsa_code_object_t co
= { 0 };
2557 status
= hsa_fns
.hsa_code_object_deserialize_fn
2558 (module
->image_desc
->gcn_image
->image
,
2559 module
->image_desc
->gcn_image
->size
,
2561 if (status
!= HSA_STATUS_SUCCESS
)
2563 hsa_error ("Could not deserialize GCN code object", status
);
2567 status
= hsa_fns
.hsa_executable_load_code_object_fn
2568 (agent
->executable
, agent
->id
, co
, "");
2569 if (status
!= HSA_STATUS_SUCCESS
)
2571 hsa_error ("Could not load GCN code object", status
);
2577 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
2578 gcn_kernel_heap_size
,
2579 (void**)&module
->heap
);
2580 if (status
!= HSA_STATUS_SUCCESS
)
2582 hsa_error ("Could not allocate memory for GCN heap", status
);
2586 status
= hsa_fns
.hsa_memory_assign_agent_fn
2587 (module
->heap
, agent
->id
, HSA_ACCESS_PERMISSION_RW
);
2588 if (status
!= HSA_STATUS_SUCCESS
)
2590 hsa_error ("Could not assign GCN heap memory to device", status
);
2594 hsa_fns
.hsa_memory_copy_fn (&module
->heap
->size
,
2595 &gcn_kernel_heap_size
,
2596 sizeof (gcn_kernel_heap_size
));
2602 dump_executable_symbols (agent
->executable
);
2604 status
= hsa_fns
.hsa_executable_freeze_fn (agent
->executable
, "");
2605 if (status
!= HSA_STATUS_SUCCESS
)
2607 hsa_error ("Could not freeze the GCN executable", status
);
2612 agent
->prog_finalized
= true;
2614 if (pthread_mutex_unlock (&agent
->prog_mutex
))
2616 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2627 /* Free the HSA program in agent and everything associated with it and set
2628 agent->prog_finalized and the initialized flags of all kernels to false.
2629 Return TRUE on success. */
2632 destroy_hsa_program (struct agent_info
*agent
)
2634 if (!agent
->prog_finalized
)
2637 hsa_status_t status
;
2639 GCN_DEBUG ("Destroying the current GCN program.\n");
2641 status
= hsa_fns
.hsa_executable_destroy_fn (agent
->executable
);
2642 if (status
!= HSA_STATUS_SUCCESS
)
2643 return hsa_error ("Could not destroy GCN executable", status
);
2648 for (i
= 0; i
< agent
->module
->kernel_count
; i
++)
2649 agent
->module
->kernels
[i
].initialized
= false;
2651 if (agent
->module
->heap
)
2653 hsa_fns
.hsa_memory_free_fn (agent
->module
->heap
);
2654 agent
->module
->heap
= NULL
;
2657 agent
->prog_finalized
= false;
2661 /* Deinitialize all information associated with MODULE and kernels within
2662 it. Return TRUE on success. */
2665 destroy_module (struct module_info
*module
, bool locked
)
2667 /* Run destructors before destroying module. */
2668 struct GOMP_kernel_launch_attributes kla
=
2672 /* Work-group size. */
2676 if (module
->fini_array_func
)
2678 init_kernel (module
->fini_array_func
);
2679 run_kernel (module
->fini_array_func
, NULL
, &kla
, NULL
, locked
);
2681 module
->constructors_run_p
= false;
2684 for (i
= 0; i
< module
->kernel_count
; i
++)
2685 if (pthread_mutex_destroy (&module
->kernels
[i
].init_mutex
))
2687 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2698 /* Callback of dispatch queues to report errors. */
2701 execute_queue_entry (struct goacc_asyncqueue
*aq
, int index
)
2703 struct queue_entry
*entry
= &aq
->queue
[index
];
2705 switch (entry
->type
)
2709 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2710 aq
->agent
->device_id
, aq
->id
, index
);
2711 run_kernel (entry
->u
.launch
.kernel
,
2712 entry
->u
.launch
.vars
,
2713 &entry
->u
.launch
.kla
, aq
, false);
2715 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2716 aq
->agent
->device_id
, aq
->id
, index
);
2721 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2722 aq
->agent
->device_id
, aq
->id
, index
);
2723 entry
->u
.callback
.fn (entry
->u
.callback
.data
);
2725 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2726 aq
->agent
->device_id
, aq
->id
, index
);
2731 /* FIXME: is it safe to access a placeholder that may already have
2733 struct placeholder
*placeholderp
= entry
->u
.asyncwait
.placeholderp
;
2736 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2737 aq
->agent
->device_id
, aq
->id
, index
);
2739 pthread_mutex_lock (&placeholderp
->mutex
);
2741 while (!placeholderp
->executed
)
2742 pthread_cond_wait (&placeholderp
->cond
, &placeholderp
->mutex
);
2744 pthread_mutex_unlock (&placeholderp
->mutex
);
2746 if (pthread_cond_destroy (&placeholderp
->cond
))
2747 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2749 if (pthread_mutex_destroy (&placeholderp
->mutex
))
2750 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2753 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2754 "entry (%d) done\n", aq
->agent
->device_id
, aq
->id
, index
);
2758 case ASYNC_PLACEHOLDER
:
2759 pthread_mutex_lock (&entry
->u
.placeholder
.mutex
);
2760 entry
->u
.placeholder
.executed
= 1;
2761 pthread_cond_signal (&entry
->u
.placeholder
.cond
);
2762 pthread_mutex_unlock (&entry
->u
.placeholder
.mutex
);
2766 GOMP_PLUGIN_fatal ("Unknown queue element");
2770 /* This function is run as a thread to service an async queue in the
2771 background. It runs continuously until the stop flag is set. */
2774 drain_queue (void *thread_arg
)
2776 struct goacc_asyncqueue
*aq
= thread_arg
;
2778 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
2780 aq
->drain_queue_stop
= 2;
2784 pthread_mutex_lock (&aq
->mutex
);
2788 if (aq
->drain_queue_stop
)
2791 if (aq
->queue_n
> 0)
2793 pthread_mutex_unlock (&aq
->mutex
);
2794 execute_queue_entry (aq
, aq
->queue_first
);
2796 pthread_mutex_lock (&aq
->mutex
);
2797 aq
->queue_first
= ((aq
->queue_first
+ 1)
2798 % ASYNC_QUEUE_SIZE
);
2801 if (DEBUG_THREAD_SIGNAL
)
2802 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2803 aq
->agent
->device_id
, aq
->id
);
2804 pthread_cond_broadcast (&aq
->queue_cond_out
);
2805 pthread_mutex_unlock (&aq
->mutex
);
2808 GCN_DEBUG ("Async thread %d:%d: continue\n", aq
->agent
->device_id
,
2810 pthread_mutex_lock (&aq
->mutex
);
2814 if (DEBUG_THREAD_SLEEP
)
2815 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2816 aq
->agent
->device_id
, aq
->id
);
2817 pthread_cond_wait (&aq
->queue_cond_in
, &aq
->mutex
);
2818 if (DEBUG_THREAD_SLEEP
)
2819 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2820 aq
->agent
->device_id
, aq
->id
);
2824 aq
->drain_queue_stop
= 2;
2825 if (DEBUG_THREAD_SIGNAL
)
2826 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2827 aq
->agent
->device_id
, aq
->id
);
2828 pthread_cond_broadcast (&aq
->queue_cond_out
);
2829 pthread_mutex_unlock (&aq
->mutex
);
2831 GCN_DEBUG ("Async thread %d:%d: returning\n", aq
->agent
->device_id
, aq
->id
);
2835 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2836 is not usually the case. This is just a debug tool. */
2839 drain_queue_synchronous (struct goacc_asyncqueue
*aq
)
2841 pthread_mutex_lock (&aq
->mutex
);
2843 while (aq
->queue_n
> 0)
2845 execute_queue_entry (aq
, aq
->queue_first
);
2847 aq
->queue_first
= ((aq
->queue_first
+ 1)
2848 % ASYNC_QUEUE_SIZE
);
2852 pthread_mutex_unlock (&aq
->mutex
);
2855 /* Block the current thread until an async queue is writable. The aq->mutex
2856 lock should be held on entry, and remains locked on exit. */
2859 wait_for_queue_nonfull (struct goacc_asyncqueue
*aq
)
2861 if (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2863 /* Queue is full. Wait for it to not be full. */
2864 while (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2865 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2869 /* Request an asynchronous kernel launch on the specified queue. This
2870 may block if the queue is full, but returns without waiting for the
2874 queue_push_launch (struct goacc_asyncqueue
*aq
, struct kernel_info
*kernel
,
2875 void *vars
, struct GOMP_kernel_launch_attributes
*kla
)
2877 assert (aq
->agent
== kernel
->agent
);
2879 pthread_mutex_lock (&aq
->mutex
);
2881 wait_for_queue_nonfull (aq
);
2883 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2884 % ASYNC_QUEUE_SIZE
);
2886 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq
->agent
->device_id
,
2887 aq
->id
, queue_last
);
2889 aq
->queue
[queue_last
].type
= KERNEL_LAUNCH
;
2890 aq
->queue
[queue_last
].u
.launch
.kernel
= kernel
;
2891 aq
->queue
[queue_last
].u
.launch
.vars
= vars
;
2892 aq
->queue
[queue_last
].u
.launch
.kla
= *kla
;
2896 if (DEBUG_THREAD_SIGNAL
)
2897 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2898 aq
->agent
->device_id
, aq
->id
);
2899 pthread_cond_signal (&aq
->queue_cond_in
);
2901 pthread_mutex_unlock (&aq
->mutex
);
2904 /* Request an asynchronous callback on the specified queue. The callback
2905 function will be called, with the given opaque data, from the appropriate
2906 async thread, when all previous items on that queue are complete. */
2909 queue_push_callback (struct goacc_asyncqueue
*aq
, void (*fn
)(void *),
2912 pthread_mutex_lock (&aq
->mutex
);
2914 wait_for_queue_nonfull (aq
);
2916 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2917 % ASYNC_QUEUE_SIZE
);
2919 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq
->agent
->device_id
,
2920 aq
->id
, queue_last
);
2922 aq
->queue
[queue_last
].type
= CALLBACK
;
2923 aq
->queue
[queue_last
].u
.callback
.fn
= fn
;
2924 aq
->queue
[queue_last
].u
.callback
.data
= data
;
2928 if (DEBUG_THREAD_SIGNAL
)
2929 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2930 aq
->agent
->device_id
, aq
->id
);
2931 pthread_cond_signal (&aq
->queue_cond_in
);
2933 pthread_mutex_unlock (&aq
->mutex
);
2936 /* Request that a given async thread wait for another thread (unspecified) to
2937 reach the given placeholder. The wait will occur when all previous entries
2938 on the queue are complete. A placeholder is effectively a kind of signal
2939 which simply sets a flag when encountered in a queue. */
2942 queue_push_asyncwait (struct goacc_asyncqueue
*aq
,
2943 struct placeholder
*placeholderp
)
2945 pthread_mutex_lock (&aq
->mutex
);
2947 wait_for_queue_nonfull (aq
);
2949 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2951 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq
->agent
->device_id
,
2952 aq
->id
, queue_last
);
2954 aq
->queue
[queue_last
].type
= ASYNC_WAIT
;
2955 aq
->queue
[queue_last
].u
.asyncwait
.placeholderp
= placeholderp
;
2959 if (DEBUG_THREAD_SIGNAL
)
2960 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2961 aq
->agent
->device_id
, aq
->id
);
2962 pthread_cond_signal (&aq
->queue_cond_in
);
2964 pthread_mutex_unlock (&aq
->mutex
);
2967 /* Add a placeholder into an async queue. When the async thread reaches the
2968 placeholder it will set the "executed" flag to true and continue.
2969 Another thread may be waiting on this thread reaching the placeholder. */
2971 static struct placeholder
*
2972 queue_push_placeholder (struct goacc_asyncqueue
*aq
)
2974 struct placeholder
*placeholderp
;
2976 pthread_mutex_lock (&aq
->mutex
);
2978 wait_for_queue_nonfull (aq
);
2980 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2982 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq
->agent
->device_id
,
2983 aq
->id
, queue_last
);
2985 aq
->queue
[queue_last
].type
= ASYNC_PLACEHOLDER
;
2986 placeholderp
= &aq
->queue
[queue_last
].u
.placeholder
;
2988 if (pthread_mutex_init (&placeholderp
->mutex
, NULL
))
2990 pthread_mutex_unlock (&aq
->mutex
);
2991 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2994 if (pthread_cond_init (&placeholderp
->cond
, NULL
))
2996 pthread_mutex_unlock (&aq
->mutex
);
2997 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
3000 placeholderp
->executed
= 0;
3004 if (DEBUG_THREAD_SIGNAL
)
3005 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
3006 aq
->agent
->device_id
, aq
->id
);
3007 pthread_cond_signal (&aq
->queue_cond_in
);
3009 pthread_mutex_unlock (&aq
->mutex
);
3011 return placeholderp
;
3014 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
3017 finalize_async_thread (struct goacc_asyncqueue
*aq
)
3019 pthread_mutex_lock (&aq
->mutex
);
3020 if (aq
->drain_queue_stop
== 2)
3022 pthread_mutex_unlock (&aq
->mutex
);
3026 aq
->drain_queue_stop
= 1;
3028 if (DEBUG_THREAD_SIGNAL
)
3029 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3030 aq
->agent
->device_id
, aq
->id
);
3031 pthread_cond_signal (&aq
->queue_cond_in
);
3033 while (aq
->drain_queue_stop
!= 2)
3035 if (DEBUG_THREAD_SLEEP
)
3036 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3037 " to sleep\n", aq
->agent
->device_id
, aq
->id
);
3038 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
3039 if (DEBUG_THREAD_SLEEP
)
3040 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
3041 aq
->agent
->device_id
, aq
->id
);
3044 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq
->agent
->device_id
,
3046 pthread_mutex_unlock (&aq
->mutex
);
3048 int err
= pthread_join (aq
->thread_drain_queue
, NULL
);
3050 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3051 aq
->agent
->device_id
, aq
->id
, strerror (err
));
3052 GCN_DEBUG ("Joined with async thread %d:%d\n", aq
->agent
->device_id
, aq
->id
);
3055 /* Set up an async queue for OpenMP. There will be only one. The
3056 implementation simply uses an OpenACC async queue.
3057 FIXME: is this thread-safe if two threads call this function? */
3060 maybe_init_omp_async (struct agent_info
*agent
)
3062 if (!agent
->omp_async_queue
)
3063 agent
->omp_async_queue
3064 = GOMP_OFFLOAD_openacc_async_construct (agent
->device_id
);
3067 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3068 copies from read-only pages. */
3071 hsa_memory_copy_wrapper (void *dst
, const void *src
, size_t len
)
3073 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, len
);
3075 if (status
== HSA_STATUS_SUCCESS
)
3078 /* It appears that the copy fails if the source data is in a read-only page.
3079 We can't detect that easily, so try copying the data to a temporary buffer
3080 and doing the copy again if we got an error above. */
3082 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3083 "[%p:+%d]\n", (void *) src
, (int) len
);
3085 void *src_copy
= malloc (len
);
3086 memcpy (src_copy
, src
, len
);
3087 status
= hsa_fns
.hsa_memory_copy_fn (dst
, (const void *) src_copy
, len
);
3089 if (status
!= HSA_STATUS_SUCCESS
)
3090 GOMP_PLUGIN_error ("memory copy failed");
3093 /* Copy data to or from a device. This is intended for use as an async
3097 copy_data (void *data_
)
3099 struct copy_data
*data
= (struct copy_data
*)data_
;
3100 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3101 data
->aq
->agent
->device_id
, data
->aq
->id
, data
->len
, data
->src
,
3103 hsa_memory_copy_wrapper (data
->dst
, data
->src
, data
->len
);
3107 /* Request an asynchronous data copy, to or from a device, on a given queue.
3108 The event will be registered as a callback. */
3111 queue_push_copy (struct goacc_asyncqueue
*aq
, void *dst
, const void *src
,
3115 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3116 aq
->agent
->device_id
, aq
->id
, len
, src
, dst
);
3117 struct copy_data
*data
3118 = (struct copy_data
*)GOMP_PLUGIN_malloc (sizeof (struct copy_data
));
3123 queue_push_callback (aq
, copy_data
, data
);
3126 /* Return true if the given queue is currently empty. */
3129 queue_empty (struct goacc_asyncqueue
*aq
)
3131 pthread_mutex_lock (&aq
->mutex
);
3132 int res
= aq
->queue_n
== 0 ? 1 : 0;
3133 pthread_mutex_unlock (&aq
->mutex
);
3138 /* Wait for a given queue to become empty. This implements an OpenACC wait
3142 wait_queue (struct goacc_asyncqueue
*aq
)
3144 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
3146 drain_queue_synchronous (aq
);
3150 pthread_mutex_lock (&aq
->mutex
);
3152 while (aq
->queue_n
> 0)
3154 if (DEBUG_THREAD_SLEEP
)
3155 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3156 aq
->agent
->device_id
, aq
->id
);
3157 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
3158 if (DEBUG_THREAD_SLEEP
)
3159 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq
->agent
->device_id
,
3163 pthread_mutex_unlock (&aq
->mutex
);
3164 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq
->agent
->device_id
, aq
->id
);
3168 /* {{{ OpenACC support */
3170 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3173 gcn_exec (struct kernel_info
*kernel
,
3174 void **devaddrs
, unsigned *dims
, void *targ_mem_desc
, bool async
,
3175 struct goacc_asyncqueue
*aq
)
3177 if (!GOMP_OFFLOAD_can_run (kernel
))
3178 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3180 /* If we get here then this must be an OpenACC kernel. */
3181 kernel
->kind
= KIND_OPENACC
;
3183 struct hsa_kernel_description
*hsa_kernel_desc
= NULL
;
3184 for (unsigned i
= 0; i
< kernel
->module
->image_desc
->kernel_count
; i
++)
3186 struct hsa_kernel_description
*d
3187 = &kernel
->module
->image_desc
->kernel_infos
[i
];
3188 if (d
->name
== kernel
->name
)
3190 hsa_kernel_desc
= d
;
3195 /* We may have statically-determined dimensions in
3196 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3197 invocation at runtime in dims[]. We allow static dimensions to take
3198 priority over dynamic dimensions when present (non-zero). */
3199 if (hsa_kernel_desc
->oacc_dims
[0] > 0)
3200 dims
[0] = hsa_kernel_desc
->oacc_dims
[0];
3201 if (hsa_kernel_desc
->oacc_dims
[1] > 0)
3202 dims
[1] = hsa_kernel_desc
->oacc_dims
[1];
3203 if (hsa_kernel_desc
->oacc_dims
[2] > 0)
3204 dims
[2] = hsa_kernel_desc
->oacc_dims
[2];
3206 /* Ideally, when a dimension isn't explicitly specified, we should
3207 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3208 In practice, we tune for peak performance on BabelStream, which
3209 for OpenACC is currently 32 threads per CU. */
3210 if (dims
[0] == 0 && dims
[1] == 0)
3212 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3213 number. There isn't really a correct answer for this without a clue
3214 about the problem size, so let's do a reasonable number of workers
3217 dims
[0] = get_cu_count (kernel
->agent
) * 4; /* Gangs. */
3218 dims
[1] = 8; /* Workers. */
3220 else if (dims
[0] == 0 && dims
[1] > 0)
3222 /* Auto-scale the number of gangs with the requested number of workers. */
3223 dims
[0] = get_cu_count (kernel
->agent
) * (32 / dims
[1]);
3225 else if (dims
[0] > 0 && dims
[1] == 0)
3227 /* Auto-scale the number of workers with the requested number of gangs. */
3228 dims
[1] = get_cu_count (kernel
->agent
) * 32 / dims
[0];
3235 /* The incoming dimensions are expressed in terms of gangs, workers, and
3236 vectors. The HSA dimensions are expressed in terms of "work-items",
3237 which means multiples of vector lanes.
3239 The "grid size" specifies the size of the problem space, and the
3240 "work-group size" specifies how much of that we want a single compute
3241 unit to chew on at once.
3243 The three dimensions do not really correspond to hardware, but the
3244 important thing is that the HSA runtime will launch as many
3245 work-groups as it takes to process the entire grid, and each
3246 work-group will contain as many wave-fronts as it takes to process
3247 the work-items in that group.
3249 Essentially, as long as we set the Y dimension to 64 (the number of
3250 vector lanes in hardware), and the Z group size to the maximum (16),
3251 then we will get the gangs (X) and workers (Z) launched as we expect.
3253 The reason for the apparent reversal of vector and worker dimension
3254 order is to do with the way the run-time distributes work-items across
3256 struct GOMP_kernel_launch_attributes kla
=
3259 {dims
[0], 64, dims
[1]},
3260 /* Work-group size. */
3264 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3265 acc_prof_info
*prof_info
= thr
->prof_info
;
3266 acc_event_info enqueue_launch_event_info
;
3267 acc_api_info
*api_info
= thr
->api_info
;
3268 bool profiling_dispatch_p
= __builtin_expect (prof_info
!= NULL
, false);
3269 if (profiling_dispatch_p
)
3271 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
3273 enqueue_launch_event_info
.launch_event
.event_type
3274 = prof_info
->event_type
;
3275 enqueue_launch_event_info
.launch_event
.valid_bytes
3276 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
3277 enqueue_launch_event_info
.launch_event
.parent_construct
3278 = acc_construct_parallel
;
3279 enqueue_launch_event_info
.launch_event
.implicit
= 1;
3280 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
3281 enqueue_launch_event_info
.launch_event
.kernel_name
3282 = (char *) kernel
->name
;
3283 enqueue_launch_event_info
.launch_event
.num_gangs
= kla
.gdims
[0];
3284 enqueue_launch_event_info
.launch_event
.num_workers
= kla
.gdims
[2];
3285 enqueue_launch_event_info
.launch_event
.vector_length
= kla
.gdims
[1];
3287 api_info
->device_api
= acc_device_api_other
;
3289 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3290 &enqueue_launch_event_info
, api_info
);
3294 run_kernel (kernel
, devaddrs
, &kla
, NULL
, false);
3296 queue_push_launch (aq
, kernel
, devaddrs
, &kla
);
3298 if (profiling_dispatch_p
)
3300 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
3301 enqueue_launch_event_info
.launch_event
.event_type
= prof_info
->event_type
;
3302 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3303 &enqueue_launch_event_info
,
3309 /* {{{ Generic Plugin API */
/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "gcn";
}
3319 /* Return the specific capabilities the HSA accelerator have. */
3322 GOMP_OFFLOAD_get_caps (void)
3324 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3325 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3326 | GOMP_OFFLOAD_CAP_OPENACC_200
;
3329 /* Identify as GCN accelerator. */
3332 GOMP_OFFLOAD_get_type (void)
3334 return OFFLOAD_TARGET_TYPE_GCN
;
3337 /* Return the libgomp version number we're compatible with. There is
3338 no requirement for cross-version compatibility. */
3341 GOMP_OFFLOAD_version (void)
3343 return GOMP_VERSION
;
3346 /* Return the number of GCN devices on the system. */
3349 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask
)
3351 if (!init_hsa_context (true))
3352 exit (EXIT_FAILURE
);
3353 /* Return -1 if no omp_requires_mask cannot be fulfilled but
3354 devices were present. */
3355 if (hsa_context
.agent_count
> 0
3356 && ((omp_requires_mask
3357 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3358 | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
3359 | GOMP_REQUIRES_REVERSE_OFFLOAD
)) != 0))
3361 /* Check whether host page access is supported; this is per system level
3362 (all GPUs supported by HSA). While intrinsically true for APUs, it
3363 requires XNACK support for discrete GPUs. */
3364 if (hsa_context
.agent_count
> 0
3365 && (omp_requires_mask
& GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
))
3368 hsa_system_info_t type
= HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT
;
3369 hsa_status_t status
= hsa_fns
.hsa_system_get_info_fn (type
, &b
);
3370 if (status
!= HSA_STATUS_SUCCESS
)
3371 GOMP_PLUGIN_error ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT "
3377 return hsa_context
.agent_count
;
3380 /* Initialize device (agent) number N so that it can be used for computation.
3381 Return TRUE on success. */
3384 GOMP_OFFLOAD_init_device (int n
)
3386 if (!init_hsa_context (false))
3388 if (n
>= hsa_context
.agent_count
)
3390 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n
);
3393 struct agent_info
*agent
= &hsa_context
.agents
[n
];
3395 if (agent
->initialized
)
3398 agent
->device_id
= n
;
3400 if (pthread_rwlock_init (&agent
->module_rwlock
, NULL
))
3402 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3405 if (pthread_mutex_init (&agent
->prog_mutex
, NULL
))
3407 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3410 if (pthread_mutex_init (&agent
->async_queues_mutex
, NULL
))
3412 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3415 if (pthread_mutex_init (&agent
->ephemeral_memories_write_lock
, NULL
))
3417 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3420 agent
->async_queues
= NULL
;
3421 agent
->omp_async_queue
= NULL
;
3422 agent
->ephemeral_memories_list
= NULL
;
3424 uint32_t queue_size
;
3425 hsa_status_t status
;
3426 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
,
3427 HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
3429 if (status
!= HSA_STATUS_SUCCESS
)
3430 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3433 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_NAME
,
3435 if (status
!= HSA_STATUS_SUCCESS
)
3436 return hsa_error ("Error querying the name of the agent", status
);
3438 agent
->device_isa
= isa_code (agent
->name
);
3439 if (agent
->device_isa
== EF_AMDGPU_MACH_UNSUPPORTED
)
3440 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR
);
3442 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_VENDOR_NAME
,
3443 &agent
->vendor_name
);
3444 if (status
!= HSA_STATUS_SUCCESS
)
3445 return hsa_error ("Error querying the vendor name of the agent", status
);
3447 status
= hsa_fns
.hsa_queue_create_fn (agent
->id
, queue_size
,
3448 HSA_QUEUE_TYPE_MULTI
,
3449 hsa_queue_callback
, NULL
, UINT32_MAX
,
3450 UINT32_MAX
, &agent
->sync_queue
);
3451 if (status
!= HSA_STATUS_SUCCESS
)
3452 return hsa_error ("Error creating command queue", status
);
3454 agent
->kernarg_region
.handle
= (uint64_t) -1;
3455 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3456 get_kernarg_memory_region
,
3457 &agent
->kernarg_region
);
3458 if (status
!= HSA_STATUS_SUCCESS
3459 && status
!= HSA_STATUS_INFO_BREAK
)
3460 hsa_error ("Scanning memory regions failed", status
);
3461 if (agent
->kernarg_region
.handle
== (uint64_t) -1)
3463 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3467 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3468 dump_hsa_region (agent
->kernarg_region
, NULL
);
3470 agent
->data_region
.handle
= (uint64_t) -1;
3471 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3472 get_data_memory_region
,
3473 &agent
->data_region
);
3474 if (status
!= HSA_STATUS_SUCCESS
3475 && status
!= HSA_STATUS_INFO_BREAK
)
3476 hsa_error ("Scanning memory regions failed", status
);
3477 if (agent
->data_region
.handle
== (uint64_t) -1)
3479 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3483 GCN_DEBUG ("Selected device data memory region:\n");
3484 dump_hsa_region (agent
->data_region
, NULL
);
3486 GCN_DEBUG ("GCN agent %d initialized\n", n
);
3488 agent
->initialized
= true;
3492 /* Load GCN object-code module described by struct gcn_image_desc in
3493 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3494 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3495 contain the on-device addresses of the functions for reverse offload. To be
3496 freed by the caller. */
3499 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
3500 struct addr_pair
**target_table
,
3501 uint64_t **rev_fn_table
,
3502 uint64_t *host_ind_fn_table
)
3504 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3506 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3507 " (expected %u, received %u)",
3508 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3512 struct gcn_image_desc
*image_desc
= (struct gcn_image_desc
*) target_data
;
3513 struct agent_info
*agent
;
3514 struct addr_pair
*pair
;
3515 struct module_info
*module
;
3516 struct kernel_info
*kernel
;
3517 int kernel_count
= image_desc
->kernel_count
;
3518 unsigned ind_func_count
= GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version
)
3519 ? image_desc
->ind_func_count
: 0;
3520 unsigned var_count
= image_desc
->global_variable_count
;
3521 /* Currently, "others" is a struct of ICVS. */
3522 int other_count
= 1;
3524 agent
= get_agent_info (ord
);
3528 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3530 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3533 if (agent
->prog_finalized
3534 && !destroy_hsa_program (agent
))
3537 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count
);
3538 GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count
);
3539 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count
);
3540 GCN_DEBUG ("Expect %d other variables in an image\n", other_count
);
3541 pair
= GOMP_PLUGIN_malloc ((kernel_count
+ var_count
+ other_count
- 2)
3542 * sizeof (struct addr_pair
));
3543 *target_table
= pair
;
3544 module
= (struct module_info
*)
3545 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info
)
3546 + kernel_count
* sizeof (struct kernel_info
));
3547 module
->image_desc
= image_desc
;
3548 module
->kernel_count
= kernel_count
;
3549 module
->heap
= NULL
;
3550 module
->constructors_run_p
= false;
3552 kernel
= &module
->kernels
[0];
3554 /* Allocate memory for kernel dependencies. */
3555 for (unsigned i
= 0; i
< kernel_count
; i
++)
3557 struct hsa_kernel_description
*d
= &image_desc
->kernel_infos
[i
];
3558 if (!init_basic_kernel_info (kernel
, d
, agent
, module
))
3560 if (strcmp (d
->name
, "_init_array") == 0)
3561 module
->init_array_func
= kernel
;
3562 else if (strcmp (d
->name
, "_fini_array") == 0)
3563 module
->fini_array_func
= kernel
;
3566 pair
->start
= (uintptr_t) kernel
;
3567 pair
->end
= (uintptr_t) (kernel
+ 1);
3573 agent
->module
= module
;
3574 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3576 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3580 if (!create_and_finalize_hsa_program (agent
))
3585 hsa_status_t status
;
3586 hsa_executable_symbol_t var_symbol
;
3587 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3588 ".offload_var_table",
3592 if (status
!= HSA_STATUS_SUCCESS
)
3593 hsa_fatal ("Could not find symbol for variable in the code object",
3596 uint64_t var_table_addr
;
3597 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3598 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3600 if (status
!= HSA_STATUS_SUCCESS
)
3601 hsa_fatal ("Could not extract a variable from its symbol", status
);
3606 } var_table
[var_count
];
3607 GOMP_OFFLOAD_dev2host (agent
->device_id
, var_table
,
3608 (void*)var_table_addr
, sizeof (var_table
));
3610 for (unsigned i
= 0; i
< var_count
; i
++)
3612 pair
->start
= var_table
[i
].addr
;
3613 pair
->end
= var_table
[i
].addr
+ var_table
[i
].size
;
3614 GCN_DEBUG ("Found variable at %p with size %lu\n",
3615 (void *)var_table
[i
].addr
, var_table
[i
].size
);
3620 if (ind_func_count
> 0)
3622 hsa_status_t status
;
3624 /* Read indirect function table from image. */
3625 hsa_executable_symbol_t ind_funcs_symbol
;
3626 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3627 ".offload_ind_func_table",
3629 0, &ind_funcs_symbol
);
3631 if (status
!= HSA_STATUS_SUCCESS
)
3632 hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3633 "code object", status
);
3635 uint64_t ind_funcs_table_addr
;
3636 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3637 (ind_funcs_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3638 &ind_funcs_table_addr
);
3639 if (status
!= HSA_STATUS_SUCCESS
)
3640 hsa_fatal ("Could not extract a variable from its symbol", status
);
3642 uint64_t ind_funcs_table
[ind_func_count
];
3643 GOMP_OFFLOAD_dev2host (agent
->device_id
, ind_funcs_table
,
3644 (void*) ind_funcs_table_addr
,
3645 sizeof (ind_funcs_table
));
3647 /* Build host->target address map for indirect functions. */
3648 uint64_t ind_fn_map
[ind_func_count
* 2 + 1];
3649 for (unsigned i
= 0; i
< ind_func_count
; i
++)
3651 ind_fn_map
[i
* 2] = host_ind_fn_table
[i
];
3652 ind_fn_map
[i
* 2 + 1] = ind_funcs_table
[i
];
3653 GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3654 i
, host_ind_fn_table
[i
], ind_funcs_table
[i
]);
3656 ind_fn_map
[ind_func_count
* 2] = 0;
3658 /* Write the map onto the target. */
3659 void *map_target_addr
3660 = GOMP_OFFLOAD_alloc (agent
->device_id
, sizeof (ind_fn_map
));
3661 GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr
);
3663 GOMP_OFFLOAD_host2dev (agent
->device_id
, map_target_addr
,
3665 sizeof (ind_fn_map
));
3667 /* Write address of the map onto the target. */
3668 hsa_executable_symbol_t symbol
;
3671 = hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3672 XSTRING (GOMP_INDIRECT_ADDR_MAP
),
3673 agent
->id
, 0, &symbol
);
3674 if (status
!= HSA_STATUS_SUCCESS
)
3675 hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3681 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3682 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3684 if (status
!= HSA_STATUS_SUCCESS
)
3685 hsa_fatal ("Could not extract a variable from its symbol", status
);
3686 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3687 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE
,
3689 if (status
!= HSA_STATUS_SUCCESS
)
3690 hsa_fatal ("Could not extract a variable size from its symbol",
3693 GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3696 GOMP_OFFLOAD_host2dev (agent
->device_id
, (void *) varptr
,
3698 sizeof (map_target_addr
));
3701 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS
));
3703 hsa_status_t status
;
3704 hsa_executable_symbol_t var_symbol
;
3705 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3706 XSTRING (GOMP_ADDITIONAL_ICVS
),
3707 agent
->id
, 0, &var_symbol
);
3708 if (status
== HSA_STATUS_SUCCESS
)
3713 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3714 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3716 if (status
!= HSA_STATUS_SUCCESS
)
3717 hsa_fatal ("Could not extract a variable from its symbol", status
);
3718 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3719 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE
,
3721 if (status
!= HSA_STATUS_SUCCESS
)
3722 hsa_fatal ("Could not extract a variable size from its symbol",
3725 pair
->start
= varptr
;
3726 pair
->end
= varptr
+ varsize
;
3730 /* The variable was not in this image. */
3731 GCN_DEBUG ("Variable not found in image: %s\n",
3732 XSTRING (GOMP_ADDITIONAL_ICVS
));
3733 pair
->start
= pair
->end
= 0;
3736 /* Ensure that constructors are run first. */
3737 struct GOMP_kernel_launch_attributes kla
=
3741 /* Work-group size. */
3745 if (module
->init_array_func
)
3747 init_kernel (module
->init_array_func
);
3748 run_kernel (module
->init_array_func
, NULL
, &kla
, NULL
, false);
3750 module
->constructors_run_p
= true;
3752 /* Don't report kernels that libgomp need not know about. */
3753 if (module
->init_array_func
)
3755 if (module
->fini_array_func
)
3758 if (rev_fn_table
!= NULL
&& kernel_count
== 0)
3759 *rev_fn_table
= NULL
;
3760 else if (rev_fn_table
!= NULL
)
3762 hsa_status_t status
;
3763 hsa_executable_symbol_t var_symbol
;
3764 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3765 ".offload_func_table",
3766 agent
->id
, 0, &var_symbol
);
3767 if (status
!= HSA_STATUS_SUCCESS
)
3768 hsa_fatal ("Could not find symbol for variable in the code object",
3770 uint64_t fn_table_addr
;
3771 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3772 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3774 if (status
!= HSA_STATUS_SUCCESS
)
3775 hsa_fatal ("Could not extract a variable from its symbol", status
);
3776 *rev_fn_table
= GOMP_PLUGIN_malloc (kernel_count
* sizeof (uint64_t));
3777 GOMP_OFFLOAD_dev2host (agent
->device_id
, *rev_fn_table
,
3778 (void*) fn_table_addr
,
3779 kernel_count
* sizeof (uint64_t));
3782 return kernel_count
+ var_count
+ other_count
;
3785 /* Unload GCN object-code module described by struct gcn_image_desc in
3786 TARGET_DATA from agent number N. Return TRUE on success. */
3789 GOMP_OFFLOAD_unload_image (int n
, unsigned version
, const void *target_data
)
3791 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3793 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3794 " (expected %u, received %u)",
3795 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3799 struct agent_info
*agent
;
3800 agent
= get_agent_info (n
);
3804 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3806 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3810 if (!agent
->module
|| agent
->module
->image_desc
!= target_data
)
3812 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3817 if (!destroy_module (agent
->module
, true))
3819 free (agent
->module
);
3820 agent
->module
= NULL
;
3821 if (!destroy_hsa_program (agent
))
3823 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3825 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3831 /* Deinitialize all information and status associated with agent number N. We
3832 do not attempt any synchronization, assuming the user and libgomp will not
3833 attempt deinitialization of a device that is in any way being used at the
3834 same time. Return TRUE on success. */
3837 GOMP_OFFLOAD_fini_device (int n
)
3839 struct agent_info
*agent
= get_agent_info (n
);
3843 if (!agent
->initialized
)
3846 if (agent
->omp_async_queue
)
3848 GOMP_OFFLOAD_openacc_async_destruct (agent
->omp_async_queue
);
3849 agent
->omp_async_queue
= NULL
;
3854 if (!destroy_module (agent
->module
, false))
3856 free (agent
->module
);
3857 agent
->module
= NULL
;
3860 if (!destroy_ephemeral_memories (agent
))
3863 if (!destroy_hsa_program (agent
))
3866 hsa_status_t status
= hsa_fns
.hsa_queue_destroy_fn (agent
->sync_queue
);
3867 if (status
!= HSA_STATUS_SUCCESS
)
3868 return hsa_error ("Error destroying command queue", status
);
3870 if (pthread_mutex_destroy (&agent
->prog_mutex
))
3872 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3875 if (pthread_rwlock_destroy (&agent
->module_rwlock
))
3877 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3881 if (pthread_mutex_destroy (&agent
->async_queues_mutex
))
3883 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3886 if (pthread_mutex_destroy (&agent
->ephemeral_memories_write_lock
))
3888 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3891 agent
->initialized
= false;
3895 /* Return true if the HSA runtime can run function FN_PTR. */
3898 GOMP_OFFLOAD_can_run (void *fn_ptr
)
3900 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3902 init_kernel (kernel
);
3903 if (kernel
->initialization_failed
)
3904 GOMP_PLUGIN_fatal ("kernel initialization failed");
/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}
3918 /* Free memory from device N. */
3921 GOMP_OFFLOAD_free (int device
, void *ptr
)
3923 GCN_DEBUG ("Freeing memory on device %d\n", device
);
3925 hsa_status_t status
= hsa_fns
.hsa_memory_free_fn (ptr
);
3926 if (status
!= HSA_STATUS_SUCCESS
)
3928 hsa_error ("Could not free device memory", status
);
3932 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3933 bool profiling_dispatch_p
3934 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
3935 if (profiling_dispatch_p
)
3937 acc_prof_info
*prof_info
= thr
->prof_info
;
3938 acc_event_info data_event_info
;
3939 acc_api_info
*api_info
= thr
->api_info
;
3941 prof_info
->event_type
= acc_ev_free
;
3943 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
3944 data_event_info
.data_event
.valid_bytes
3945 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
3946 data_event_info
.data_event
.parent_construct
3947 = acc_construct_parallel
;
3948 data_event_info
.data_event
.implicit
= 1;
3949 data_event_info
.data_event
.tool_info
= NULL
;
3950 data_event_info
.data_event
.var_name
= NULL
;
3951 data_event_info
.data_event
.bytes
= 0;
3952 data_event_info
.data_event
.host_ptr
= NULL
;
3953 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
3955 api_info
->device_api
= acc_device_api_other
;
3957 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
3964 /* Copy data from DEVICE to host. */
3967 GOMP_OFFLOAD_dev2host (int device
, void *dst
, const void *src
, size_t n
)
3969 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n
, device
,
3971 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3972 if (status
!= HSA_STATUS_SUCCESS
)
3973 GOMP_PLUGIN_error ("memory copy failed");
/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  /* The wrapper handles error reporting internally — presumably; confirm
     against hsa_memory_copy_wrapper's definition elsewhere in this file.  */
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}
3988 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3991 GOMP_OFFLOAD_dev2dev (int device
, void *dst
, const void *src
, size_t n
)
3993 struct gcn_thread
*thread_data
= gcn_thread ();
3995 if (thread_data
&& !async_synchronous_p (thread_data
->async
))
3997 struct agent_info
*agent
= get_agent_info (device
);
3998 maybe_init_omp_async (agent
);
3999 queue_push_copy (agent
->omp_async_queue
, dst
, src
, n
);
4003 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n
,
4004 device
, src
, device
, dst
);
4005 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
4006 if (status
!= HSA_STATUS_SUCCESS
)
4007 GOMP_PLUGIN_error ("memory copy failed");
4011 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
4012 measured in bytes. So we have:
4014 dim1_size: number of bytes to copy on innermost dimension ("row")
4015 dim0_len: number of rows to copy
4016 dst: base pointer for destination of copy
4017 dst_offset1_size: innermost row offset (for dest), in bytes
4018 dst_offset0_len: offset, number of rows (for dest)
4019 dst_dim1_size: whole-array dest row length, in bytes (pitch)
4020 src: base pointer for source of copy
4021 src_offset1_size: innermost row offset (for source), in bytes
4022 src_offset0_len: offset, number of rows (for source)
4023 src_dim1_size: whole-array source row length, in bytes (pitch)
4027 GOMP_OFFLOAD_memcpy2d (int dst_ord
, int src_ord
, size_t dim1_size
,
4028 size_t dim0_len
, void *dst
, size_t dst_offset1_size
,
4029 size_t dst_offset0_len
, size_t dst_dim1_size
,
4030 const void *src
, size_t src_offset1_size
,
4031 size_t src_offset0_len
, size_t src_dim1_size
)
4033 if (!hsa_fns
.hsa_amd_memory_lock_fn
4034 || !hsa_fns
.hsa_amd_memory_unlock_fn
4035 || !hsa_fns
.hsa_amd_memory_async_copy_rect_fn
)
4038 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
4039 out quietly if we have anything oddly-aligned rather than letting the
4040 driver raise an error. */
4041 if ((((uintptr_t) dst
) & 3) != 0 || (((uintptr_t) src
) & 3) != 0)
4044 if ((dst_dim1_size
& 3) != 0 || (src_dim1_size
& 3) != 0)
4047 /* Only handle host to device or device to host transfers here. */
4048 if ((dst_ord
== -1 && src_ord
== -1)
4049 || (dst_ord
!= -1 && src_ord
!= -1))
4052 hsa_amd_copy_direction_t dir
4053 = (src_ord
== -1) ? hsaHostToDevice
: hsaDeviceToHost
;
4054 hsa_agent_t copy_agent
;
4056 /* We need to pin (lock) host memory before we start the transfer. Try to
4057 lock the minimum size necessary, i.e. using partial first/last rows of the
4058 whole array. Something like this:
4062 c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
4063 o | ++#######+++++ <- whole row
4064 l | ++#######+++++ <- "
4065 s v ++#######..... <- last row apart from trailing remainder
4068 We could split very large transfers into several rectangular copies, but
4069 that is unimplemented for now. */
4071 size_t bounded_size_host
, first_elem_offset_host
;
4073 if (dir
== hsaHostToDevice
)
4075 bounded_size_host
= src_dim1_size
* (dim0_len
- 1) + dim1_size
;
4076 first_elem_offset_host
= src_offset0_len
* src_dim1_size
4078 host_ptr
= (void *) src
;
4079 struct agent_info
*agent
= get_agent_info (dst_ord
);
4080 copy_agent
= agent
->id
;
4084 bounded_size_host
= dst_dim1_size
* (dim0_len
- 1) + dim1_size
;
4085 first_elem_offset_host
= dst_offset0_len
* dst_dim1_size
4088 struct agent_info
*agent
= get_agent_info (src_ord
);
4089 copy_agent
= agent
->id
;
4095 = hsa_fns
.hsa_amd_memory_lock_fn (host_ptr
+ first_elem_offset_host
,
4096 bounded_size_host
, NULL
, 0, &agent_ptr
);
4097 /* We can't lock the host memory: don't give up though, we might still be
4098 able to use the slow path in our caller. So, don't make this an
4100 if (status
!= HSA_STATUS_SUCCESS
)
4103 hsa_pitched_ptr_t dstpp
, srcpp
;
4104 hsa_dim3_t dst_offsets
, src_offsets
, ranges
;
4108 hsa_signal_t completion_signal
;
4109 status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &completion_signal
);
4110 if (status
!= HSA_STATUS_SUCCESS
)
4116 if (dir
== hsaHostToDevice
)
4118 srcpp
.base
= agent_ptr
- first_elem_offset_host
;
4123 srcpp
.base
= (void *) src
;
4124 dstpp
.base
= agent_ptr
- first_elem_offset_host
;
4127 srcpp
.pitch
= src_dim1_size
;
4130 src_offsets
.x
= src_offset1_size
;
4131 src_offsets
.y
= src_offset0_len
;
4134 dstpp
.pitch
= dst_dim1_size
;
4137 dst_offsets
.x
= dst_offset1_size
;
4138 dst_offsets
.y
= dst_offset0_len
;
4141 ranges
.x
= dim1_size
;
4142 ranges
.y
= dim0_len
;
4146 = hsa_fns
.hsa_amd_memory_async_copy_rect_fn (&dstpp
, &dst_offsets
, &srcpp
,
4147 &src_offsets
, &ranges
,
4148 copy_agent
, dir
, 0, NULL
,
4150 /* If the rectangular copy fails, we might still be able to use the slow
4151 path. We need to unlock the host memory though, so don't return
4153 if (status
!= HSA_STATUS_SUCCESS
)
4156 hsa_fns
.hsa_signal_wait_acquire_fn (completion_signal
,
4157 HSA_SIGNAL_CONDITION_LT
, 1, UINT64_MAX
,
4158 HSA_WAIT_STATE_ACTIVE
);
4160 hsa_fns
.hsa_signal_destroy_fn (completion_signal
);
4163 status
= hsa_fns
.hsa_amd_memory_unlock_fn (host_ptr
+ first_elem_offset_host
);
4164 if (status
!= HSA_STATUS_SUCCESS
)
4165 hsa_fatal ("Could not unlock host memory", status
);
4170 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4171 measured in bytes. So we have:
4173 dim2_size: number of bytes to copy on innermost dimension ("row")
4174 dim1_len: number of rows per slice to copy
4175 dim0_len: number of slices to copy
4176 dst: base pointer for destination of copy
4177 dst_offset2_size: innermost row offset (for dest), in bytes
4178 dst_offset1_len: offset, number of rows (for dest)
4179 dst_offset0_len: offset, number of slices (for dest)
4180 dst_dim2_size: whole-array dest row length, in bytes (pitch)
4181 dst_dim1_len: whole-array number of rows in slice (for dest)
4182 src: base pointer for source of copy
4183 src_offset2_size: innermost row offset (for source), in bytes
4184 src_offset1_len: offset, number of rows (for source)
4185 src_offset0_len: offset, number of slices (for source)
4186 src_dim2_size: whole-array source row length, in bytes (pitch)
4187 src_dim1_len: whole-array number of rows in slice (for source)
4191 GOMP_OFFLOAD_memcpy3d (int dst_ord
, int src_ord
, size_t dim2_size
,
4192 size_t dim1_len
, size_t dim0_len
, void *dst
,
4193 size_t dst_offset2_size
, size_t dst_offset1_len
,
4194 size_t dst_offset0_len
, size_t dst_dim2_size
,
4195 size_t dst_dim1_len
, const void *src
,
4196 size_t src_offset2_size
, size_t src_offset1_len
,
4197 size_t src_offset0_len
, size_t src_dim2_size
,
4198 size_t src_dim1_len
)
4200 if (!hsa_fns
.hsa_amd_memory_lock_fn
4201 || !hsa_fns
.hsa_amd_memory_unlock_fn
4202 || !hsa_fns
.hsa_amd_memory_async_copy_rect_fn
)
4205 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
4206 out quietly if we have anything oddly-aligned rather than letting the
4207 driver raise an error. */
4208 if ((((uintptr_t) dst
) & 3) != 0 || (((uintptr_t) src
) & 3) != 0)
4211 if ((dst_dim2_size
& 3) != 0 || (src_dim2_size
& 3) != 0)
4214 /* Only handle host to device or device to host transfers here. */
4215 if ((dst_ord
== -1 && src_ord
== -1)
4216 || (dst_ord
!= -1 && src_ord
!= -1))
4219 hsa_amd_copy_direction_t dir
4220 = (src_ord
== -1) ? hsaHostToDevice
: hsaDeviceToHost
;
4221 hsa_agent_t copy_agent
;
4223 /* We need to pin (lock) host memory before we start the transfer. Try to
4224 lock the minimum size necessary, i.e. using partial first/last slices of
4225 the whole 3D array. Something like this:
4227 slice 0: slice 1: slice 2:
4228 __________ __________ __________
4229 ^ /+++++++++/ : /+++++++++/ : / /
4230 column /+++##++++/| | /+++##++++/| | /+++## / # = subarray
4231 / / ##++++/ | |/+++##++++/ | |/+++##++++/ + = area to pin
4232 /_________/ : /_________/ : /_________/
4235 We could split very large transfers into several rectangular copies, but
4236 that is unimplemented for now. */
4238 size_t bounded_size_host
, first_elem_offset_host
;
4240 if (dir
== hsaHostToDevice
)
4242 size_t slice_bytes
= src_dim2_size
* src_dim1_len
;
4243 bounded_size_host
= slice_bytes
* (dim0_len
- 1)
4244 + src_dim2_size
* (dim1_len
- 1)
4246 first_elem_offset_host
= src_offset0_len
* slice_bytes
4247 + src_offset1_len
* src_dim2_size
4249 host_ptr
= (void *) src
;
4250 struct agent_info
*agent
= get_agent_info (dst_ord
);
4251 copy_agent
= agent
->id
;
4255 size_t slice_bytes
= dst_dim2_size
* dst_dim1_len
;
4256 bounded_size_host
= slice_bytes
* (dim0_len
- 1)
4257 + dst_dim2_size
* (dim1_len
- 1)
4259 first_elem_offset_host
= dst_offset0_len
* slice_bytes
4260 + dst_offset1_len
* dst_dim2_size
4263 struct agent_info
*agent
= get_agent_info (src_ord
);
4264 copy_agent
= agent
->id
;
4270 = hsa_fns
.hsa_amd_memory_lock_fn (host_ptr
+ first_elem_offset_host
,
4271 bounded_size_host
, NULL
, 0, &agent_ptr
);
4272 /* We can't lock the host memory: don't give up though, we might still be
4273 able to use the slow path in our caller (maybe even with iterated memcpy2d
4274 calls). So, don't make this an error. */
4275 if (status
!= HSA_STATUS_SUCCESS
)
4278 hsa_pitched_ptr_t dstpp
, srcpp
;
4279 hsa_dim3_t dst_offsets
, src_offsets
, ranges
;
4283 hsa_signal_t completion_signal
;
4284 status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &completion_signal
);
4285 if (status
!= HSA_STATUS_SUCCESS
)
4291 if (dir
== hsaHostToDevice
)
4293 srcpp
.base
= agent_ptr
- first_elem_offset_host
;
4298 srcpp
.base
= (void *) src
;
4299 dstpp
.base
= agent_ptr
- first_elem_offset_host
;
4302 /* Pitch is measured in bytes. */
4303 srcpp
.pitch
= src_dim2_size
;
4304 /* Slice is also measured in bytes (i.e. total per-slice). */
4305 srcpp
.slice
= src_dim2_size
* src_dim1_len
;
4307 src_offsets
.x
= src_offset2_size
;
4308 src_offsets
.y
= src_offset1_len
;
4309 src_offsets
.z
= src_offset0_len
;
4312 dstpp
.pitch
= dst_dim2_size
;
4313 dstpp
.slice
= dst_dim2_size
* dst_dim1_len
;
4315 dst_offsets
.x
= dst_offset2_size
;
4316 dst_offsets
.y
= dst_offset1_len
;
4317 dst_offsets
.z
= dst_offset0_len
;
4319 ranges
.x
= dim2_size
;
4320 ranges
.y
= dim1_len
;
4321 ranges
.z
= dim0_len
;
4324 = hsa_fns
.hsa_amd_memory_async_copy_rect_fn (&dstpp
, &dst_offsets
, &srcpp
,
4325 &src_offsets
, &ranges
,
4326 copy_agent
, dir
, 0, NULL
,
4328 /* If the rectangular copy fails, we might still be able to use the slow
4329 path. We need to unlock the host memory though, so don't return
4331 if (status
!= HSA_STATUS_SUCCESS
)
4335 hsa_signal_value_t sv
4336 = hsa_fns
.hsa_signal_wait_acquire_fn (completion_signal
,
4337 HSA_SIGNAL_CONDITION_LT
, 1,
4339 HSA_WAIT_STATE_ACTIVE
);
4342 GCN_WARNING ("async copy rect failure");
4347 hsa_fns
.hsa_signal_destroy_fn (completion_signal
);
4350 status
= hsa_fns
.hsa_amd_memory_unlock_fn (host_ptr
+ first_elem_offset_host
);
4351 if (status
!= HSA_STATUS_SUCCESS
)
4352 hsa_fatal ("Could not unlock host memory", status
);
4358 /* {{{ OpenMP Plugin API */
4360 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4361 in VARS as a parameter. The kernel is identified by FN_PTR which must point
4362 to a kernel_info structure, and must have previously been loaded to the
4363 specified device. */
4366 GOMP_OFFLOAD_run (int device
, void *fn_ptr
, void *vars
, void **args
)
4368 struct agent_info
*agent
= get_agent_info (device
);
4369 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
4370 struct GOMP_kernel_launch_attributes def
;
4371 struct GOMP_kernel_launch_attributes
*kla
;
4372 assert (agent
== kernel
->agent
);
4374 /* If we get here then the kernel must be OpenMP. */
4375 kernel
->kind
= KIND_OPENMP
;
4377 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
4379 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4382 run_kernel (kernel
, vars
, kla
, NULL
, false);
4385 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
4386 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4387 GOMP_PLUGIN_target_task_completion when it has finished. */
4390 GOMP_OFFLOAD_async_run (int device
, void *tgt_fn
, void *tgt_vars
,
4391 void **args
, void *async_data
)
4393 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4394 struct agent_info
*agent
= get_agent_info (device
);
4395 struct kernel_info
*kernel
= (struct kernel_info
*) tgt_fn
;
4396 struct GOMP_kernel_launch_attributes def
;
4397 struct GOMP_kernel_launch_attributes
*kla
;
4398 assert (agent
== kernel
->agent
);
4400 /* If we get here then the kernel must be OpenMP. */
4401 kernel
->kind
= KIND_OPENMP
;
4403 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
4405 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4409 maybe_init_omp_async (agent
);
4410 queue_push_launch (agent
->omp_async_queue
, kernel
, tgt_vars
, kla
);
4411 queue_push_callback (agent
->omp_async_queue
,
4412 GOMP_PLUGIN_target_task_completion
, async_data
);
4416 /* {{{ OpenACC Plugin API */
4418 /* Run a synchronous OpenACC kernel. The device number is inferred from the
4419 already-loaded KERNEL. */
4422 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr
) (void *),
4423 size_t mapnum
__attribute__((unused
)),
4424 void **hostaddrs
__attribute__((unused
)),
4425 void **devaddrs
, unsigned *dims
,
4426 void *targ_mem_desc
)
4428 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
4430 gcn_exec (kernel
, devaddrs
, dims
, targ_mem_desc
, false, NULL
);
4433 /* Run an asynchronous OpenACC kernel on the specified queue. */
4436 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr
) (void *),
4437 size_t mapnum
__attribute__((unused
)),
4438 void **hostaddrs
__attribute__((unused
)),
4440 unsigned *dims
, void *targ_mem_desc
,
4441 struct goacc_asyncqueue
*aq
)
4443 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
4445 gcn_exec (kernel
, devaddrs
, dims
, targ_mem_desc
, true, aq
);
4448 /* Create a new asynchronous thread and queue for running future kernels. */
4450 struct goacc_asyncqueue
*
4451 GOMP_OFFLOAD_openacc_async_construct (int device
)
4453 struct agent_info
*agent
= get_agent_info (device
);
4455 pthread_mutex_lock (&agent
->async_queues_mutex
);
4457 struct goacc_asyncqueue
*aq
= GOMP_PLUGIN_malloc (sizeof (*aq
));
4458 aq
->agent
= get_agent_info (device
);
4460 aq
->next
= agent
->async_queues
;
4463 aq
->next
->prev
= aq
;
4464 aq
->id
= aq
->next
->id
+ 1;
4468 agent
->async_queues
= aq
;
4470 aq
->queue_first
= 0;
4472 aq
->drain_queue_stop
= 0;
4474 if (pthread_mutex_init (&aq
->mutex
, NULL
))
4476 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
4479 if (pthread_cond_init (&aq
->queue_cond_in
, NULL
))
4481 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4484 if (pthread_cond_init (&aq
->queue_cond_out
, NULL
))
4486 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4490 hsa_status_t status
= hsa_fns
.hsa_queue_create_fn (agent
->id
,
4492 HSA_QUEUE_TYPE_MULTI
,
4493 hsa_queue_callback
, NULL
,
4494 UINT32_MAX
, UINT32_MAX
,
4496 if (status
!= HSA_STATUS_SUCCESS
)
4497 hsa_fatal ("Error creating command queue", status
);
4499 int err
= pthread_create (&aq
->thread_drain_queue
, NULL
, &drain_queue
, aq
);
4501 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4503 GCN_DEBUG ("Async thread %d:%d: created\n", aq
->agent
->device_id
,
4506 pthread_mutex_unlock (&agent
->async_queues_mutex
);
/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.

   Returns true on success; on any teardown failure an error is reported
   via GOMP_PLUGIN_error/hsa_error and false is returned (the queue is
   left partially destroyed in that case).  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  /* Stop and join the drain thread before destroying the mutex and
     condition variables it uses.  */
  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  /* Unlink AQ from the agent's doubly-linked list of queues.  */
  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}
/* Return true (non-zero) if the specified async queue is currently empty.
   This is a non-blocking poll; it does not wait for tasks to finish.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq) ? 1 : 0;
}
4572 /* Block until the specified queue has executed all its tasks and the
4576 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue
*aq
)
4582 /* Add a serialization point across two async queues. Any new tasks added to
4583 AQ2, after this call, will not run until all tasks on AQ1, at the time
4584 of this call, have completed. */
4587 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue
*aq1
,
4588 struct goacc_asyncqueue
*aq2
)
4590 /* For serialize, stream aq2 waits for aq1 to complete work that has been
4591 scheduled to run on it up to this point. */
4594 struct placeholder
*placeholderp
= queue_push_placeholder (aq1
);
4595 queue_push_asyncwait (aq2
, placeholderp
);
/* Add an opaque callback to the given async queue.  FN will be invoked
   with DATA once all previously queued tasks on AQ have run.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}
4609 /* Queue up an asynchronous data copy from host to DEVICE. */
4612 GOMP_OFFLOAD_openacc_async_host2dev (int device
, void *dst
, const void *src
,
4613 size_t n
, struct goacc_asyncqueue
*aq
)
4615 struct agent_info
*agent
= get_agent_info (device
);
4616 assert (agent
== aq
->agent
);
4617 queue_push_copy (aq
, dst
, src
, n
);
4621 /* Queue up an asynchronous data copy from DEVICE to host. */
4624 GOMP_OFFLOAD_openacc_async_dev2host (int device
, void *dst
, const void *src
,
4625 size_t n
, struct goacc_asyncqueue
*aq
)
4627 struct agent_info
*agent
= get_agent_info (device
);
4628 assert (agent
== aq
->agent
);
4629 queue_push_copy (aq
, dst
, src
, n
);
/* Query a device property for the OpenACC acc_get_property API.
   Unknown or unsupported properties yield a zero-initialized value.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  /* Default result: 0 / NULL for anything we cannot answer.  */
  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported. */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
	size_t size;
	hsa_region_t region = agent->data_region;
	hsa_status_t status =
	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
	/* On failure we silently leave the default 0 rather than erroring.  */
	if (status == HSA_STATUS_SUCCESS)
	  propval.val = size;
	break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}
4669 /* Set up plugin-specific thread-local-data (host-side). */
4672 GOMP_OFFLOAD_openacc_create_thread_data (int ord
__attribute__((unused
)))
4674 struct gcn_thread
*thread_data
4675 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread
));
4677 thread_data
->async
= GOMP_ASYNC_SYNC
;
4679 return (void *) thread_data
;
4682 /* Clean up plugin-specific thread-local-data. */
4685 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)