/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2024 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* {{{ Includes and defines  */

#include "config.h"
#include "symcat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>
/* These probably won't be in elf.h for a while.  */

#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif
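
/* To illustrate how these encodings are applied (a sketch, not the loader
   code used below): given symbol value S, addend A, and place P, a loader
   resolving R_AMDGPU_ABS64 stores the full 64-bit value,

     *(uint64_t *) P = S + A;

   while R_AMDGPU_REL32_LO and R_AMDGPU_REL32_HI store the low and high
   32-bit halves of the PC-relative value S + A - P, respectively.  */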
/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100 MB.  */
/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif
/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};
/* As the HSA runtime is dlopened, the following structure defines the
   function pointers utilized by the HSA plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
  hsa_status_t (*hsa_amd_memory_lock_fn)
    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
     void **agent_ptr);
  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
     const hsa_dim3_t *range, hsa_agent_t copy_agent,
     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
};
/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
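
/* For example (a sketch matching the defaults computed by
   parse_target_attributes below), a launch of 40 teams of 16 threads,
   each thread a 64-lane wavefront, would be described as:

     struct GOMP_kernel_launch_attributes kla = {
       .ndim = 3,
       .gdims = { 40, 64, 16 },
       .wdims = { 1, 64, 16 }
     };
*/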
/* Collection of information needed for a dispatch of a kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};
/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  struct kernargs_abi abi;

  /* Output data.  */
  struct output output_data;
};
/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};
/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned ind_func_count;
  const unsigned global_variable_count;
};
/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_UNSUPPORTED = -1,
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
  EF_AMDGPU_MACH_AMDGCN_GFX90c = 0x032,
  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
  EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
  EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
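
/* The ISA of a loaded code object is encoded in the low byte of the ELF
   header's e_flags, so, as elf_gcn_isa_field below does,

     int isa = image->e_flags & EF_AMDGPU_MACH_MASK;

   yields e.g. 0x02c (EF_AMDGPU_MACH_AMDGCN_GFX900) for a gfx900 image.  */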
/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};
/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module in which the kernel resides.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};
/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};
/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address of the allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};
/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};
/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Ephemeral memory sizes for each kernel launch.  */

static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;
static int lowlat_size = -1;

/* Flag to decide whether to print information about what is going on to
   stderr.  Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the HSA runtime shared library that is dlopened by this
   plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should support also CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	fprintf (stderr, __VA_ARGS__); \
    } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
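
/* Usage sketch: with GCN_DEBUG set in the environment, a call such as

     GCN_DEBUG ("Loaded kernel %s\n", kernel->name);

   prints "GCN debug: Loaded kernel ..." to stderr and flushes the stream;
   when GCN_DEBUG is unset the expansion is a no-op.  */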
/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report error message, and return FALSE
   for propagating error processing to outside of plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}
/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}
/* Dump information about the available hardware.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about the available hardware.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}
/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}
/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}
/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}
/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}
/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->abi.heap_ptr);
  fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
	   "", (void*)kernargs->abi.arena_ptr,
	   kernargs->abi.arena_size_per_team);
  fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
	   "", (void*)kernargs->abi.stack_ptr,
	   kernargs->abi.stack_size_per_thread);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
	   "", dispatch->group_segment_size);
  fprintf (stderr, "\n");
}
/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}
/* Initialize debug and suppress_host_fallback according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }

  const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
  if (arena)
    {
      int tmp = atoi (arena);
      if (tmp)
	team_arena_size = tmp;
    }

  const char *stack = secure_getenv ("GCN_STACK_SIZE");
  if (stack)
    {
      int tmp = atoi (stack);
      if (tmp)
	stack_size = tmp;
    }

  const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
  if (lowlat)
    lowlat_size = atoi (lowlat);
}
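
/* For example (illustrative only), running an offload program as

     GCN_DEBUG=1 GCN_NUM_TEAMS=8 GCN_NUM_THREADS=4 \
       GCN_HEAP_SIZE=268435456 ./a.out

   enables the debug output above, overrides the default grid dimensions
   chosen in parse_target_attributes, and grows the device heap to
   256 MB.  */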
/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}
/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME: Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int max_teams = 2 * get_cu_count (agent);
  if (teams > max_teams)
    teams = max_teams;
  return teams;
}
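
/* For instance, on an agent reporting 40 compute units, limit_teams clamps
   a request for 200 teams to 80 (2 * 40), and limit_worker_threads clamps
   a thread-limit of 64 to 16.  */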
/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting
	 queued.  In practice, we tune for peak performance on BabelStream,
	 which for OpenACC is currently 32 threads per CU.  */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads.  */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads.  */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64;  /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1;   /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
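
/* E.g. a one-dimensional grid of 1000 work items with no requested group
   size yields group = 64, while a grid of only 32 clamps that default
   down to 32.  */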
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
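
/* This matches the HSA requirement that a packet's header and setup words
   be published with a single atomic release store once the packet body is
   complete, e.g. (a sketch of the pattern used when dispatching kernels):

     packet_store_release ((uint32_t *) packet, header,
			   (uint16_t) kla->ndim
			   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

   so the GPU's packet processor never observes a half-written header.  */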
/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}
/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
#define DLSYM_OPT_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function);

  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  DLSYM_OPT_FN (hsa_amd_memory_lock)
  DLSYM_OPT_FN (hsa_amd_memory_unlock)
  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
  return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
}
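
/* For instance, DLSYM_FN (hsa_init) expands to

     hsa_fns.hsa_init_fn = dlsym (handle, "hsa_init");
     if (hsa_fns.hsa_init_fn == NULL)
       GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib,
			  "hsa_init");

   whereas DLSYM_OPT_FN leaves optional hsa_amd_* entry points NULL when the
   runtime does not provide them, so callers must check before use.  */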
static gcn_isa isa_code (const char *isa);

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      {
	char name[64];
	hsa_status_t status
	  = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
	if (status != HSA_STATUS_SUCCESS
	    || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
	  {
	    GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
		       status == HSA_STATUS_SUCCESS ? name : "invalid");
	    return false;
	  }
      }
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}
/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  DATA points to the
   index of the structure; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}
/* Initialize hsa_context if it has not already been done.
   If !PROBE: returns TRUE on success.
   If PROBE: returns TRUE on success or if the plugin/device shall be silently
   ignored, and otherwise emits an error and returns FALSE.  */

static bool
init_hsa_context (bool probe)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      const char *msg = "Run-time could not be dynamically opened";
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("%s\n", msg);
      else
	GCN_WARNING ("%s\n", msg);
      return probe ? true : false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int)major,
			  (unsigned short int)minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated.  "
		 "Version %hu.%hu is too long.", (unsigned short int)major,
		 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}
/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}
/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
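
/* A caller uses these with hsa_agent_iterate_regions, which stops when the
   callback returns HSA_STATUS_INFO_BREAK (a sketch):

     hsa_region_t region;
     hsa_status_t status
       = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					       get_kernarg_memory_region,
					       &region);

   REGION is valid only if status == HSA_STATUS_INFO_BREAK.  */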
static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static char *gcn_gfx90c_s = "gfx90c";
const static char *gcn_gfx1030_s = "gfx1030";
const static char *gcn_gfx1036_s = "gfx1036";
const static char *gcn_gfx1100_s = "gfx1100";
const static char *gcn_gfx1103_s = "gfx1103";
const static int gcn_isa_name_len = 7;
/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char*
isa_hsa_name (int isa) {
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return gcn_gfx908_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return gcn_gfx90a_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX90c:
      return gcn_gfx90c_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
      return gcn_gfx1030_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1036:
      return gcn_gfx1036_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1100:
      return gcn_gfx1100_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1103:
      return gcn_gfx1103_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char*
isa_gcc_name (int isa) {
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa) {
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX908;

  if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX90a;

  if (!strncmp (isa, gcn_gfx90c_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX90c;

  if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1030;

  if (!strncmp (isa, gcn_gfx1036_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1036;

  if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1100;

  if (!strncmp (isa, gcn_gfx1103_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1103;

  return EF_AMDGPU_MACH_UNSUPPORTED;
}
/* CDNA2 devices have twice as many VGPRs compared to older devices.  */

static int
max_isa_vgprs (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return 256;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return 512;
    case EF_AMDGPU_MACH_AMDGCN_GFX90c:
      return 256;
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
    case EF_AMDGPU_MACH_AMDGCN_GFX1036:
      return 512;  /* 512 SIMD32 = 256 wavefrontsize64.  */
    case EF_AMDGPU_MACH_AMDGCN_GFX1100:
    case EF_AMDGPU_MACH_AMDGCN_GFX1103:
      return 1536; /* 1536 SIMD32 = 768 wavefrontsize64.  */
    }
  GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
}
/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena and stack space.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   The stack space is used by all kernels.  We must allocate it in such a
   way that the reverse offload implementation can access the data.

   Allocating this memory costs performance, so this function will reuse an
   existing allocation if a large enough one is idle.
   The memory lock is released, but not deallocated, when the kernel exits.  */
1815 static void
1816 configure_ephemeral_memories (struct kernel_info *kernel,
1817 struct kernargs_abi *kernargs, int num_teams,
1818 int num_threads)
1820 struct agent_info *agent = kernel->agent;
1821 struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1822 struct ephemeral_memories_list *item;
1824 int actual_arena_size = (kernel->kind == KIND_OPENMP
1825 ? team_arena_size : 0);
1826 int actual_arena_total_size = actual_arena_size * num_teams;
1827 size_t size = (actual_arena_total_size
1828 + num_teams * num_threads * stack_size);
1830 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1832 if (item->size < size)
1833 continue;
1835 if (pthread_mutex_trylock (&item->in_use) == 0)
1836 break;
1839 if (!item)
1841 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1842 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1843 num_teams, num_threads, size);
1845 if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1847 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1848 return;
1850 item = malloc (sizeof (*item));
1851 item->size = size;
1852 item->next = NULL;
1853 *next_ptr = item;
1855 if (pthread_mutex_init (&item->in_use, NULL))
1857 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1858 return;
1860 if (pthread_mutex_lock (&item->in_use))
1862 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1863 return;
1865 if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1867 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1868 return;
1871 hsa_status_t status;
1872 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1873 &item->address);
1874 if (status != HSA_STATUS_SUCCESS)
1875 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1876 status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1877 HSA_ACCESS_PERMISSION_RW);
1878 if (status != HSA_STATUS_SUCCESS)
1879 hsa_fatal ("Could not assign arena & stack memory to device", status);
1882 kernargs->arena_ptr = (actual_arena_total_size
1883 ? (uint64_t)item->address
1884 : 0);
1885 kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1886 kernargs->arena_size_per_team = actual_arena_size;
1887 kernargs->stack_size_per_thread = stack_size;
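
/* Illustrative sketch only (never called by the plugin): given the layout
   configured above -- all team arenas first, then one stack per thread --
   the device-visible stack slot for a given team/thread pair would be
   computed as below.  The helper name and the team-major ordering of the
   stacks are assumptions made for illustration; only the field names come
   from struct kernargs_abi.  */

static inline uint64_t
example_stack_slot (const struct kernargs_abi *kernargs, int team,
		    int thread, int num_threads)
{
  /* Stacks begin at stack_ptr, which already points past the arenas.  */
  return ((uint64_t) kernargs->stack_ptr
	  + ((uint64_t) team * num_threads + thread)
	    * kernargs->stack_size_per_thread);
}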
1890 /* Mark an ephemeral memory space available for reuse. */
1892 static void
1893 release_ephemeral_memories (struct agent_info* agent, void *address)
1895 struct ephemeral_memories_list *item;
1897 for (item = agent->ephemeral_memories_list; item; item = item->next)
1899 if (item->address == address)
1901 if (pthread_mutex_unlock (&item->in_use))
1902 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1903 return;
1906 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1909 /* Clean up all the allocated team arenas. */
1911 static bool
1912 destroy_ephemeral_memories (struct agent_info *agent)
1914 struct ephemeral_memories_list *item, *next;
1916 for (item = agent->ephemeral_memories_list; item; item = next)
1918 next = item->next;
1919 hsa_fns.hsa_memory_free_fn (item->address);
1920 if (pthread_mutex_destroy (&item->in_use))
1922 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1923 return false;
1925 free (item);
1927 agent->ephemeral_memories_list = NULL;
1929 return true;
1932 /* Allocate memory on a specified device. */
1934 static void *
1935 alloc_by_agent (struct agent_info *agent, size_t size)
1937 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1939 void *ptr;
1940 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1941 size, &ptr);
1942 if (status != HSA_STATUS_SUCCESS)
1944 hsa_error ("Could not allocate device memory", status);
1945 return NULL;
1948 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1949 HSA_ACCESS_PERMISSION_RW);
1950 if (status != HSA_STATUS_SUCCESS)
1952 hsa_error ("Could not assign data memory to device", status);
1953 return NULL;
1956 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1957 bool profiling_dispatch_p
1958 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1959 if (profiling_dispatch_p)
1961 acc_prof_info *prof_info = thr->prof_info;
1962 acc_event_info data_event_info;
1963 acc_api_info *api_info = thr->api_info;
1965 prof_info->event_type = acc_ev_alloc;
1967 data_event_info.data_event.event_type = prof_info->event_type;
1968 data_event_info.data_event.valid_bytes
1969 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1970 data_event_info.data_event.parent_construct
1971 = acc_construct_parallel;
1972 data_event_info.data_event.implicit = 1;
1973 data_event_info.data_event.tool_info = NULL;
1974 data_event_info.data_event.var_name = NULL;
1975 data_event_info.data_event.bytes = size;
1976 data_event_info.data_event.host_ptr = NULL;
1977 data_event_info.data_event.device_ptr = (void *) ptr;
1979 api_info->device_api = acc_device_api_other;
1981 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1982 api_info);
1985 return ptr;
1988 /* Create kernel dispatch data structure for given KERNEL, along with
1989 the necessary device signals and memory allocations. */
1991 static struct kernel_dispatch *
1992 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1993 int num_threads)
1995 struct agent_info *agent = kernel->agent;
1996 struct kernel_dispatch *shadow
1997 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1999 shadow->agent = kernel->agent;
2000 shadow->object = kernel->object;
2002 hsa_signal_t sync_signal;
2003 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
2004 if (status != HSA_STATUS_SUCCESS)
2005 hsa_fatal ("Error creating the GCN sync signal", status);
2007 shadow->signal = sync_signal.handle;
2008 shadow->private_segment_size = kernel->private_segment_size;
2010 if (lowlat_size < 0)
2012 /* Divide the LDS among the running teams;
2013 allocate no less than the kernel metadata defines. */
2014 int teams_per_cu = num_teams / get_cu_count (agent);
2015 int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
2016 shadow->group_segment_size
2017 = (kernel->group_segment_size > LDS_per_team
2018 ? kernel->group_segment_size
2019 : LDS_per_team);
2021 else if (lowlat_size < GCN_LOWLAT_HEAP+8)
2022 /* Ensure that there's space for the OpenMP libgomp data. */
2023 shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
2024 else
2025 shadow->group_segment_size = (lowlat_size > 65536
2026 ? 65536
2027 : lowlat_size);
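
  /* Worked example (illustrative numbers): with no user override
     (lowlat_size < 0), 240 teams on a 60-CU device give teams_per_cu = 4,
     so each team may use 65536 / 4 = 16384 bytes of LDS -- unless the
     kernel metadata already demands more, in which case the larger
     figure wins.  */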
2029 /* We expect kernels to request a single pointer, explicitly, and the
2030 rest of struct kernargs, implicitly. If they request anything else
2031 then something is wrong. */
2032 if (kernel->kernarg_segment_size > 8)
2034 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
2035 return NULL;
2038 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
2039 sizeof (struct kernargs),
2040 &shadow->kernarg_address);
2041 if (status != HSA_STATUS_SUCCESS)
2042 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
2043 struct kernargs *kernargs = shadow->kernarg_address;
2045 /* Zero-initialize the output_data (minimum needed). */
2046 kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
2047 kernargs->output_data.next_output = 0;
2048 for (unsigned i = 0;
2049 i < (sizeof (kernargs->output_data.queue)
2050 / sizeof (kernargs->output_data.queue[0]));
2051 i++)
2052 kernargs->output_data.queue[i].written = 0;
2053 kernargs->output_data.consumed = 0;
2055 /* Pass in the heap location. */
2056 kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
2058 /* Create the ephemeral memory spaces. */
2059 configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
2061 /* Ensure we can recognize unset return values. */
2062 kernargs->output_data.return_value = 0xcafe0000;
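
  /* The resulting encoding, as decoded at the end of run_kernel below:
       0xcafe0000                      -- exit was never called;
       0xffff0000 | signal<<8 | code   -- exit was called.
     The low two bytes are only meaningful in the second case.  */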
2064 return shadow;
2067 static void
2068 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
2069 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
2071 int dev_num = dev_num64;
2072 GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
2073 NULL);
2076 /* Output any data written to console output from the kernel. It is expected
2077 that this function is polled during kernel execution.
2079 We print all entries from the last item printed up to the first entry
2080 that does not yet have its "written" flag set. If the "final" flag is
2081 set then printing continues right to the end.
2083 The print buffer is circular, but the from and to locations don't wrap when
2084 the buffer does, so the output limit is UINT_MAX. The target blocks on
2085 output when the buffer is full. */
2087 static void
2088 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2089 bool final)
2091 unsigned int limit = (sizeof (kernargs->output_data.queue)
2092 / sizeof (kernargs->output_data.queue[0]));
2094 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2095 __ATOMIC_ACQUIRE);
2096 unsigned int to = kernargs->output_data.next_output;
2098 if (from > to)
2100 /* Overflow. */
2101 if (final)
2102 printf ("GCN print buffer overflowed.\n");
2103 return;
2106 unsigned int i;
2107 for (i = from; i < to; i++)
2109 struct printf_data *data = &kernargs->output_data.queue[i%limit];
2111 if (!data->written && !final)
2112 break;
2114 switch (data->type)
2116 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2117 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2118 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2119 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
2120 case 4:
2121 process_reverse_offload (data->value_u64[0], data->value_u64[1],
2122 data->value_u64[2], data->value_u64[3],
2123 data->value_u64[4], data->value_u64[5]);
2124 break;
2125 default: printf ("GCN print buffer error!\n"); break;
2127 data->written = 0;
2128 __atomic_store_n (&kernargs->output_data.consumed, i+1,
2129 __ATOMIC_RELEASE);
2131 fflush (stdout);
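
/* A minimal sketch of the slot arithmetic used above, assuming the same
   monotonically increasing 32-bit counters: entry K always lives in slot
   K % limit, and "to - from" cannot legitimately exceed the queue size
   because the target blocks while the buffer is full.  Hypothetical
   helper, for illustration only.  */

static inline struct printf_data *
example_console_slot (struct kernargs *kernargs, unsigned int k)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));
  return &kernargs->output_data.queue[k % limit];
}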
2134 /* Release data structure created for a kernel dispatch in SHADOW argument,
2135 and clean up the signal and memory allocations. */
2137 static void
2138 release_kernel_dispatch (struct kernel_dispatch *shadow)
2140 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2142 struct kernargs *kernargs = shadow->kernarg_address;
2143 void *addr = (void *)kernargs->abi.arena_ptr;
2144 if (!addr)
2145 addr = (void *)kernargs->abi.stack_ptr;
2146 release_ephemeral_memories (shadow->agent, addr);
2148 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2150 hsa_signal_t s;
2151 s.handle = shadow->signal;
2152 hsa_fns.hsa_signal_destroy_fn (s);
2154 free (shadow);
2157 /* Extract the properties from a kernel binary. */
2159 static void
2160 init_kernel_properties (struct kernel_info *kernel)
2162 hsa_status_t status;
2163 struct agent_info *agent = kernel->agent;
2164 hsa_executable_symbol_t kernel_symbol;
2165 char *buf = alloca (strlen (kernel->name) + 4);
2166 sprintf (buf, "%s.kd", kernel->name);
2167 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2168 buf, agent->id,
2169 0, &kernel_symbol);
2170 if (status != HSA_STATUS_SUCCESS)
2172 hsa_warn ("Could not find symbol for kernel in the code object", status);
2173 fprintf (stderr, "symbol name not found: '%s'\n", buf);
2174 dump_executable_symbols (agent->executable);
2175 goto failure;
2177 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2178 status = hsa_fns.hsa_executable_symbol_get_info_fn
2179 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2180 if (status != HSA_STATUS_SUCCESS)
2181 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2182 status = hsa_fns.hsa_executable_symbol_get_info_fn
2183 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2184 &kernel->kernarg_segment_size);
2185 if (status != HSA_STATUS_SUCCESS)
2186 hsa_fatal ("Could not get info about kernel argument size", status);
2187 status = hsa_fns.hsa_executable_symbol_get_info_fn
2188 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2189 &kernel->group_segment_size);
2190 if (status != HSA_STATUS_SUCCESS)
2191 hsa_fatal ("Could not get info about kernel group segment size", status);
2192 status = hsa_fns.hsa_executable_symbol_get_info_fn
2193 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2194 &kernel->private_segment_size);
2195 if (status != HSA_STATUS_SUCCESS)
2196 hsa_fatal ("Could not get info about kernel private segment size",
2197 status);
2199 /* The kernel type is not known until something tries to launch it. */
2200 kernel->kind = KIND_UNKNOWN;
2202 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2203 "following segment sizes: \n", kernel->name);
2204 GCN_DEBUG (" group_segment_size: %u\n",
2205 (unsigned) kernel->group_segment_size);
2206 GCN_DEBUG (" private_segment_size: %u\n",
2207 (unsigned) kernel->private_segment_size);
2208 GCN_DEBUG (" kernarg_segment_size: %u\n",
2209 (unsigned) kernel->kernarg_segment_size);
2210 return;
2212 failure:
2213 kernel->initialization_failed = true;
2216 /* Do all the work that is necessary before running KERNEL for the first time.
2217 The function assumes the program has been created, finalized and frozen by
2218 create_and_finalize_hsa_program. */
2220 static void
2221 init_kernel (struct kernel_info *kernel)
2223 if (pthread_mutex_lock (&kernel->init_mutex))
2224 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2225 if (kernel->initialized)
2227 if (pthread_mutex_unlock (&kernel->init_mutex))
2228 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2229 "mutex");
2231 return;
2234 init_kernel_properties (kernel);
2236 if (!kernel->initialization_failed)
2238 GCN_DEBUG ("\n");
2240 kernel->initialized = true;
2242 if (pthread_mutex_unlock (&kernel->init_mutex))
2243 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2244 "mutex");
2247 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2248 launch attributes from KLA.
2250 MODULE_LOCKED indicates that the caller already holds the lock and
2251 run_kernel need not lock it again.
2252 If AQ is NULL then agent->sync_queue will be used. */
2254 static void
2255 run_kernel (struct kernel_info *kernel, void *vars,
2256 struct GOMP_kernel_launch_attributes *kla,
2257 struct goacc_asyncqueue *aq, bool module_locked)
2259 struct agent_info *agent = kernel->agent;
2260 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2261 kernel->description->vpgr_count);
2263 /* Reduce the number of threads/workers if there are insufficient
2264 VGPRs available to run the kernels together. */
2265 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2267 int max_vgprs = max_isa_vgprs (agent->device_isa);
2268 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2269 int max_threads = (max_vgprs / granulated_vgprs) * 4;
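
      /* Worked example (illustrative): a kernel using 101 VGPRs is
	 granulated up to 104, so on a CDNA2 device (512 VGPRs) at most
	 (512 / 104) * 4 = 16 threads/workers fit, and a request for 32
	 would be clamped to 16 below.  */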
2270 if (kla->gdims[2] > max_threads)
2272 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2273 " per team/gang - reducing to %d threads/workers.\n",
2274 kla->gdims[2], max_threads);
2275 kla->gdims[2] = max_threads;
2279 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2280 (aq ? aq->id : 0));
2281 GCN_DEBUG ("GCN launch attribs: gdims:[");
2282 int i;
2283 for (i = 0; i < kla->ndim; ++i)
2285 if (i)
2286 DEBUG_PRINT (", ");
2287 DEBUG_PRINT ("%u", kla->gdims[i]);
2289 DEBUG_PRINT ("], normalized gdims:[");
2290 for (i = 0; i < kla->ndim; ++i)
2292 if (i)
2293 DEBUG_PRINT (", ");
2294 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2296 DEBUG_PRINT ("], wdims:[");
2297 for (i = 0; i < kla->ndim; ++i)
2299 if (i)
2300 DEBUG_PRINT (", ");
2301 DEBUG_PRINT ("%u", kla->wdims[i]);
2303 DEBUG_PRINT ("]\n");
2304 DEBUG_FLUSH ();
2306 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2307 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2309 if (!agent->initialized)
2310 GOMP_PLUGIN_fatal ("Agent must be initialized");
2312 if (!kernel->initialized)
2313 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2315 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2317 uint64_t index
2318 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2319 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2321 /* Wait until the queue is not full before writing the packet. */
2322 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2323 >= command_q->size)
2326 /* Do not allow the dimensions to be overridden when running
2327 constructors or destructors. */
2328 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2329 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2331 hsa_kernel_dispatch_packet_t *packet;
2332 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2333 + index % command_q->size;
2335 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2336 packet->grid_size_x = override_x ? : kla->gdims[0];
2337 packet->workgroup_size_x = get_group_size (kla->ndim,
2338 packet->grid_size_x,
2339 kla->wdims[0]);
2341 if (kla->ndim >= 2)
2343 packet->grid_size_y = kla->gdims[1];
2344 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2345 kla->wdims[1]);
2347 else
2349 packet->grid_size_y = 1;
2350 packet->workgroup_size_y = 1;
2353 if (kla->ndim == 3)
2355 packet->grid_size_z = limit_worker_threads (override_z
2356 ? : kla->gdims[2]);
2357 packet->workgroup_size_z = get_group_size (kla->ndim,
2358 packet->grid_size_z,
2359 kla->wdims[2]);
2361 else
2363 packet->grid_size_z = 1;
2364 packet->workgroup_size_z = 1;
2367 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2368 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2369 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2370 packet->grid_size_x / packet->workgroup_size_x,
2371 packet->grid_size_y / packet->workgroup_size_y,
2372 packet->grid_size_z / packet->workgroup_size_z,
2373 packet->workgroup_size_x, packet->workgroup_size_y,
2374 packet->workgroup_size_z);
2376 struct kernel_dispatch *shadow
2377 = create_kernel_dispatch (kernel, packet->grid_size_x,
2378 packet->grid_size_z);
2379 shadow->queue = command_q;
2381 if (debug)
2383 fprintf (stderr, "\nKernel has following dependencies:\n");
2384 print_kernel_dispatch (shadow, 2);
2387 packet->private_segment_size = shadow->private_segment_size;
2388 packet->group_segment_size = shadow->group_segment_size;
2389 packet->kernel_object = shadow->object;
2390 packet->kernarg_address = shadow->kernarg_address;
2391 hsa_signal_t s;
2392 s.handle = shadow->signal;
2393 packet->completion_signal = s;
2394 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2395 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2397 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2399 uint16_t header;
2400 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2401 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2402 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2404 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2405 agent->device_id);
2407 packet_store_release ((uint32_t *) packet, header,
2408 (uint16_t) kla->ndim
2409 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2411 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2412 index);
2414 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2416 /* Wait on the completion signal with a 1ms timeout, polling console output between waits. */
2417 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2418 1000 * 1000,
2419 HSA_WAIT_STATE_BLOCKED) != 0)
2421 console_output (kernel, shadow->kernarg_address, false);
2423 console_output (kernel, shadow->kernarg_address, true);
2425 struct kernargs *kernargs = shadow->kernarg_address;
2426 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2428 release_kernel_dispatch (shadow);
2430 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2431 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2433 unsigned int upper = (return_value & ~0xffff) >> 16;
2434 if (upper == 0xcafe)
2435 ; /* exit not called; normal termination. */
2436 else if (upper == 0xffff)
2437 ; /* exit called. */
2438 else
2440 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2441 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2442 return_value);
2443 abort ();
2446 if (upper == 0xffff)
2448 unsigned int signal = (return_value >> 8) & 0xff;
2450 if (signal == SIGABRT)
2452 GCN_WARNING ("GCN Kernel aborted\n");
2453 abort ();
2455 else if (signal != 0)
2457 GCN_WARNING ("GCN Kernel received unknown signal\n");
2458 abort ();
2461 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2462 exit (return_value & 0xff);
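
/* A standalone sketch of the AQL submission protocol followed in
   run_kernel above, for illustration only (the plugin never calls this;
   the completion signal and all error handling are omitted).  P is a
   fully initialized packet except for its first 4 bytes, which hold the
   header and setup words and must be written last, with release
   semantics, so the packet processor never observes a partially written
   packet.  */

static inline void
example_submit_aql_packet (hsa_queue_t *q,
			   const hsa_kernel_dispatch_packet_t *p,
			   uint16_t header, uint16_t setup)
{
  /* Claim a slot.  */
  uint64_t idx = hsa_fns.hsa_queue_add_write_index_release_fn (q, 1);

  /* Wait until the packet processor has consumed enough packets for our
     slot to be free.  */
  while (idx - hsa_fns.hsa_queue_load_read_index_acquire_fn (q) >= q->size)
    ;

  hsa_kernel_dispatch_packet_t *slot
    = ((hsa_kernel_dispatch_packet_t *) q->base_address) + idx % q->size;

  /* Body first, header last.  */
  memcpy (((uint8_t *) slot) + 4, ((const uint8_t *) p) + 4,
	  sizeof (*slot) - 4);
  packet_store_release ((uint32_t *) slot, header, setup);

  /* Ring the doorbell to publish the new packet.  */
  hsa_fns.hsa_signal_store_release_fn (q->doorbell_signal, idx);
}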
2466 /* }}} */
2467 /* {{{ Load/Unload */
2469 /* Initialize KERNEL from D and other parameters. Return true on success. */
2471 static bool
2472 init_basic_kernel_info (struct kernel_info *kernel,
2473 struct hsa_kernel_description *d,
2474 struct agent_info *agent,
2475 struct module_info *module)
2477 kernel->agent = agent;
2478 kernel->module = module;
2479 kernel->name = d->name;
2480 kernel->description = d;
2481 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2483 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2484 return false;
2486 return true;
2489 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2491 static bool
2492 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2494 int isa_field = elf_gcn_isa_field (image);
2495 const char* isa_s = isa_hsa_name (isa_field);
2496 if (!isa_s)
2498 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2499 return false;
2502 if (isa_field != agent->device_isa)
2504 char msg[120];
2505 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2506 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2507 assert (agent_isa_s);
2508 assert (agent_isa_gcc_s);
2510 snprintf (msg, sizeof msg,
2511 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2512 "Try to recompile with '-foffload-options=-march=%s'.\n",
2513 isa_s, agent_isa_s, agent_isa_gcc_s);
2515 hsa_error (msg, HSA_STATUS_ERROR);
2516 return false;
2519 return true;
2522 /* Create and finalize the program consisting of all loaded modules. */
2524 static bool
2525 create_and_finalize_hsa_program (struct agent_info *agent)
2527 hsa_status_t status;
2528 bool res = true;
2529 if (pthread_mutex_lock (&agent->prog_mutex))
2531 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2532 return false;
2534 if (agent->prog_finalized)
2535 goto final;
2537 status
2538 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2539 HSA_EXECUTABLE_STATE_UNFROZEN,
2540 "", &agent->executable);
2541 if (status != HSA_STATUS_SUCCESS)
2543 hsa_error ("Could not create GCN executable", status);
2544 goto fail;
2547 /* Load any GCN modules. */
2548 struct module_info *module = agent->module;
2549 if (module)
2551 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2553 if (!isa_matches_agent (agent, image))
2554 goto fail;
2556 hsa_code_object_t co = { 0 };
2557 status = hsa_fns.hsa_code_object_deserialize_fn
2558 (module->image_desc->gcn_image->image,
2559 module->image_desc->gcn_image->size,
2560 NULL, &co);
2561 if (status != HSA_STATUS_SUCCESS)
2563 hsa_error ("Could not deserialize GCN code object", status);
2564 goto fail;
2567 status = hsa_fns.hsa_executable_load_code_object_fn
2568 (agent->executable, agent->id, co, "");
2569 if (status != HSA_STATUS_SUCCESS)
2571 hsa_error ("Could not load GCN code object", status);
2572 goto fail;
2575 if (!module->heap)
2577 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2578 gcn_kernel_heap_size,
2579 (void**)&module->heap);
2580 if (status != HSA_STATUS_SUCCESS)
2582 hsa_error ("Could not allocate memory for GCN heap", status);
2583 goto fail;
2586 status = hsa_fns.hsa_memory_assign_agent_fn
2587 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2588 if (status != HSA_STATUS_SUCCESS)
2590 hsa_error ("Could not assign GCN heap memory to device", status);
2591 goto fail;
2594 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2595 &gcn_kernel_heap_size,
2596 sizeof (gcn_kernel_heap_size));
2601 if (debug)
2602 dump_executable_symbols (agent->executable);
2604 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2605 if (status != HSA_STATUS_SUCCESS)
2607 hsa_error ("Could not freeze the GCN executable", status);
2608 goto fail;
2611 final:
2612 agent->prog_finalized = true;
2614 if (pthread_mutex_unlock (&agent->prog_mutex))
2616 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2617 res = false;
2620 return res;
2622 fail:
2623 res = false;
2624 goto final;
2627 /* Free the HSA program in agent and everything associated with it and set
2628 agent->prog_finalized and the initialized flags of all kernels to false.
2629 Return TRUE on success. */
2631 static bool
2632 destroy_hsa_program (struct agent_info *agent)
2634 if (!agent->prog_finalized)
2635 return true;
2637 hsa_status_t status;
2639 GCN_DEBUG ("Destroying the current GCN program.\n");
2641 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2642 if (status != HSA_STATUS_SUCCESS)
2643 return hsa_error ("Could not destroy GCN executable", status);
2645 if (agent->module)
2647 int i;
2648 for (i = 0; i < agent->module->kernel_count; i++)
2649 agent->module->kernels[i].initialized = false;
2651 if (agent->module->heap)
2653 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2654 agent->module->heap = NULL;
2657 agent->prog_finalized = false;
2658 return true;
2661 /* Deinitialize all information associated with MODULE and kernels within
2662 it. Return TRUE on success. */
2664 static bool
2665 destroy_module (struct module_info *module, bool locked)
2667 /* Run destructors before destroying module. */
2668 struct GOMP_kernel_launch_attributes kla =
2669 { 3,
2670 /* Grid size. */
2671 { 1, 64, 1 },
2672 /* Work-group size. */
2673 { 1, 64, 1 }
2676 if (module->fini_array_func)
2678 init_kernel (module->fini_array_func);
2679 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2681 module->constructors_run_p = false;
2683 int i;
2684 for (i = 0; i < module->kernel_count; i++)
2685 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2687 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2688 "mutex");
2689 return false;
2692 return true;
2695 /* }}} */
2696 /* {{{ Async */
2698 /* Execute a single queue entry: a kernel launch, a host callback, an async wait, or a placeholder. */
2700 static void
2701 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2703 struct queue_entry *entry = &aq->queue[index];
2705 switch (entry->type)
2707 case KERNEL_LAUNCH:
2708 if (DEBUG_QUEUES)
2709 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2710 aq->agent->device_id, aq->id, index);
2711 run_kernel (entry->u.launch.kernel,
2712 entry->u.launch.vars,
2713 &entry->u.launch.kla, aq, false);
2714 if (DEBUG_QUEUES)
2715 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2716 aq->agent->device_id, aq->id, index);
2717 break;
2719 case CALLBACK:
2720 if (DEBUG_QUEUES)
2721 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2722 aq->agent->device_id, aq->id, index);
2723 entry->u.callback.fn (entry->u.callback.data);
2724 if (DEBUG_QUEUES)
2725 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2726 aq->agent->device_id, aq->id, index);
2727 break;
2729 case ASYNC_WAIT:
2731 /* FIXME: is it safe to access a placeholder that may already have
2732 been executed? */
2733 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2735 if (DEBUG_QUEUES)
2736 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2737 aq->agent->device_id, aq->id, index);
2739 pthread_mutex_lock (&placeholderp->mutex);
2741 while (!placeholderp->executed)
2742 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2744 pthread_mutex_unlock (&placeholderp->mutex);
2746 if (pthread_cond_destroy (&placeholderp->cond))
2747 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2749 if (pthread_mutex_destroy (&placeholderp->mutex))
2750 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2752 if (DEBUG_QUEUES)
2753 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2754 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2756 break;
2758 case ASYNC_PLACEHOLDER:
2759 pthread_mutex_lock (&entry->u.placeholder.mutex);
2760 entry->u.placeholder.executed = 1;
2761 pthread_cond_signal (&entry->u.placeholder.cond);
2762 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2763 break;
2765 default:
2766 GOMP_PLUGIN_fatal ("Unknown queue element");
2770 /* This function is run as a thread to service an async queue in the
2771 background. It runs continuously until the stop flag is set. */
2773 static void *
2774 drain_queue (void *thread_arg)
2776 struct goacc_asyncqueue *aq = thread_arg;
2778 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2780 aq->drain_queue_stop = 2;
2781 return NULL;
2784 pthread_mutex_lock (&aq->mutex);
2786 while (true)
2788 if (aq->drain_queue_stop)
2789 break;
2791 if (aq->queue_n > 0)
2793 pthread_mutex_unlock (&aq->mutex);
2794 execute_queue_entry (aq, aq->queue_first);
2796 pthread_mutex_lock (&aq->mutex);
2797 aq->queue_first = ((aq->queue_first + 1)
2798 % ASYNC_QUEUE_SIZE);
2799 aq->queue_n--;
2801 if (DEBUG_THREAD_SIGNAL)
2802 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2803 aq->agent->device_id, aq->id);
2804 pthread_cond_broadcast (&aq->queue_cond_out);
2805 pthread_mutex_unlock (&aq->mutex);
2807 if (DEBUG_QUEUES)
2808 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2809 aq->id);
2810 pthread_mutex_lock (&aq->mutex);
2812 else
2814 if (DEBUG_THREAD_SLEEP)
2815 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2816 aq->agent->device_id, aq->id);
2817 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2818 if (DEBUG_THREAD_SLEEP)
2819 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2820 aq->agent->device_id, aq->id);
2824 aq->drain_queue_stop = 2;
2825 if (DEBUG_THREAD_SIGNAL)
2826 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2827 aq->agent->device_id, aq->id);
2828 pthread_cond_broadcast (&aq->queue_cond_out);
2829 pthread_mutex_unlock (&aq->mutex);
2831 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2832 return NULL;
2835 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2836 is not usually the case. This is just a debug tool. */
2838 static void
2839 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2841 pthread_mutex_lock (&aq->mutex);
2843 while (aq->queue_n > 0)
2845 execute_queue_entry (aq, aq->queue_first);
2847 aq->queue_first = ((aq->queue_first + 1)
2848 % ASYNC_QUEUE_SIZE);
2849 aq->queue_n--;
2852 pthread_mutex_unlock (&aq->mutex);
2855 /* Block the current thread until an async queue is writable. The aq->mutex
2856 lock should be held on entry, and remains locked on exit. */
2858 static void
2859 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2861 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2863 /* Queue is full. Wait for it to not be full. */
2864 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2865 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2869 /* Request an asynchronous kernel launch on the specified queue. This
2870 may block if the queue is full, but returns without waiting for the
2871 kernel to run. */
2873 static void
2874 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2875 void *vars, struct GOMP_kernel_launch_attributes *kla)
2877 assert (aq->agent == kernel->agent);
2879 pthread_mutex_lock (&aq->mutex);
2881 wait_for_queue_nonfull (aq);
2883 int queue_last = ((aq->queue_first + aq->queue_n)
2884 % ASYNC_QUEUE_SIZE);
2885 if (DEBUG_QUEUES)
2886 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2887 aq->id, queue_last);
2889 aq->queue[queue_last].type = KERNEL_LAUNCH;
2890 aq->queue[queue_last].u.launch.kernel = kernel;
2891 aq->queue[queue_last].u.launch.vars = vars;
2892 aq->queue[queue_last].u.launch.kla = *kla;
2894 aq->queue_n++;
2896 if (DEBUG_THREAD_SIGNAL)
2897 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2898 aq->agent->device_id, aq->id);
2899 pthread_cond_signal (&aq->queue_cond_in);
2901 pthread_mutex_unlock (&aq->mutex);
2904 /* Request an asynchronous callback on the specified queue. The callback
2905 function will be called, with the given opaque data, from the appropriate
2906 async thread, when all previous items on that queue are complete. */
2908 static void
2909 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2910 void *data)
2912 pthread_mutex_lock (&aq->mutex);
2914 wait_for_queue_nonfull (aq);
2916 int queue_last = ((aq->queue_first + aq->queue_n)
2917 % ASYNC_QUEUE_SIZE);
2918 if (DEBUG_QUEUES)
2919 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2920 aq->id, queue_last);
2922 aq->queue[queue_last].type = CALLBACK;
2923 aq->queue[queue_last].u.callback.fn = fn;
2924 aq->queue[queue_last].u.callback.data = data;
2926 aq->queue_n++;
2928 if (DEBUG_THREAD_SIGNAL)
2929 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2930 aq->agent->device_id, aq->id);
2931 pthread_cond_signal (&aq->queue_cond_in);
2933 pthread_mutex_unlock (&aq->mutex);
2936 /* Request that a given async thread wait for another thread (unspecified) to
2937 reach the given placeholder. The wait will occur when all previous entries
2938 on the queue are complete. A placeholder is effectively a kind of signal
2939 which simply sets a flag when encountered in a queue. */
2941 static void
2942 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2943 struct placeholder *placeholderp)
2945 pthread_mutex_lock (&aq->mutex);
2947 wait_for_queue_nonfull (aq);
2949 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2950 if (DEBUG_QUEUES)
2951 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2952 aq->id, queue_last);
2954 aq->queue[queue_last].type = ASYNC_WAIT;
2955 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2957 aq->queue_n++;
2959 if (DEBUG_THREAD_SIGNAL)
2960 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2961 aq->agent->device_id, aq->id);
2962 pthread_cond_signal (&aq->queue_cond_in);
2964 pthread_mutex_unlock (&aq->mutex);
2967 /* Add a placeholder into an async queue. When the async thread reaches the
2968 placeholder it will set the "executed" flag to true and continue.
2969 Another thread may be waiting on this thread reaching the placeholder. */
2971 static struct placeholder *
2972 queue_push_placeholder (struct goacc_asyncqueue *aq)
2974 struct placeholder *placeholderp;
2976 pthread_mutex_lock (&aq->mutex);
2978 wait_for_queue_nonfull (aq);
2980 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2981 if (DEBUG_QUEUES)
2982 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2983 aq->id, queue_last);
2985 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2986 placeholderp = &aq->queue[queue_last].u.placeholder;
2988 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2990 pthread_mutex_unlock (&aq->mutex);
2991 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2994 if (pthread_cond_init (&placeholderp->cond, NULL))
2996 pthread_mutex_unlock (&aq->mutex);
2997 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
3000 placeholderp->executed = 0;
3002 aq->queue_n++;
3004 if (DEBUG_THREAD_SIGNAL)
3005 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
3006 aq->agent->device_id, aq->id);
3007 pthread_cond_signal (&aq->queue_cond_in);
3009 pthread_mutex_unlock (&aq->mutex);
3011 return placeholderp;
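
/* Illustrative use of the placeholder / async-wait pair above (a sketch
   of how an OpenACC serialization hook could combine them): make QUEUE_B
   wait until QUEUE_A has reached this point in its work.  */

static inline void
example_serialize_queues (struct goacc_asyncqueue *queue_a,
			  struct goacc_asyncqueue *queue_b)
{
  struct placeholder *p = queue_push_placeholder (queue_a);
  queue_push_asyncwait (queue_b, p);
}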
3014 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
3016 static void
3017 finalize_async_thread (struct goacc_asyncqueue *aq)
3019 pthread_mutex_lock (&aq->mutex);
3020 if (aq->drain_queue_stop == 2)
3022 pthread_mutex_unlock (&aq->mutex);
3023 return;
3026 aq->drain_queue_stop = 1;
3028 if (DEBUG_THREAD_SIGNAL)
3029 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3030 aq->agent->device_id, aq->id);
3031 pthread_cond_signal (&aq->queue_cond_in);
3033 while (aq->drain_queue_stop != 2)
3035 if (DEBUG_THREAD_SLEEP)
3036 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3037 " to sleep\n", aq->agent->device_id, aq->id);
3038 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3039 if (DEBUG_THREAD_SLEEP)
3040 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
3041 aq->agent->device_id, aq->id);
3044 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
3045 aq->id);
3046 pthread_mutex_unlock (&aq->mutex);
3048 int err = pthread_join (aq->thread_drain_queue, NULL);
3049 if (err != 0)
3050 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3051 aq->agent->device_id, aq->id, strerror (err));
3052 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
3055 /* Set up an async queue for OpenMP. There will be only one. The
3056 implementation simply uses an OpenACC async queue.
3057 FIXME: is this thread-safe if two threads call this function? */
3059 static void
3060 maybe_init_omp_async (struct agent_info *agent)
3062 if (!agent->omp_async_queue)
3063 agent->omp_async_queue
3064 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
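
/* One possible answer to the FIXME above, sketched for illustration:
   guard the lazy construction with a dedicated mutex so that two threads
   cannot both construct the queue.  (Assumption: nothing in the
   construct path takes this hypothetical lock.)  */

static pthread_mutex_t example_omp_async_lock = PTHREAD_MUTEX_INITIALIZER;

static inline void
example_maybe_init_omp_async (struct agent_info *agent)
{
  pthread_mutex_lock (&example_omp_async_lock);
  if (!agent->omp_async_queue)
    agent->omp_async_queue
      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
  pthread_mutex_unlock (&example_omp_async_lock);
}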
3067 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3068 copies from read-only pages. */
3070 static void
3071 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3073 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3075 if (status == HSA_STATUS_SUCCESS)
3076 return;
3078 /* It appears that the copy fails if the source data is in a read-only page.
3079 We can't detect that easily, so try copying the data to a temporary buffer
3080 and doing the copy again if we got an error above. */
3082 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3083 "[%p:+%d]\n", (void *) src, (int) len);
3085 void *src_copy = GOMP_PLUGIN_malloc (len);
3086 memcpy (src_copy, src, len);
3087 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3088 free (src_copy);
3089 if (status != HSA_STATUS_SUCCESS)
3090 GOMP_PLUGIN_error ("memory copy failed");
3093 /* Copy data to or from a device. This is intended for use as an async
3094 callback event. */
3096 static void
3097 copy_data (void *data_)
3099 struct copy_data *data = (struct copy_data *)data_;
3100 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3101 data->aq->agent->device_id, data->aq->id, data->len, data->src,
3102 data->dst);
3103 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3104 free (data);
3107 /* Request an asynchronous data copy, to or from a device, on a given queue.
3108 The event will be registered as a callback. */
3110 static void
3111 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3112 size_t len)
3114 if (DEBUG_QUEUES)
3115 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3116 aq->agent->device_id, aq->id, len, src, dst);
3117 struct copy_data *data
3118 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3119 data->dst = dst;
3120 data->src = src;
3121 data->len = len;
3122 data->aq = aq;
3123 queue_push_callback (aq, copy_data, data);
3126 /* Return true if the given queue is currently empty. */
3128 static int
3129 queue_empty (struct goacc_asyncqueue *aq)
3131 pthread_mutex_lock (&aq->mutex);
3132 int res = aq->queue_n == 0 ? 1 : 0;
3133 pthread_mutex_unlock (&aq->mutex);
3135 return res;
3138 /* Wait for a given queue to become empty. This implements an OpenACC wait
3139 directive. */
3141 static void
3142 wait_queue (struct goacc_asyncqueue *aq)
3144 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3146 drain_queue_synchronous (aq);
3147 return;
3150 pthread_mutex_lock (&aq->mutex);
3152 while (aq->queue_n > 0)
3154 if (DEBUG_THREAD_SLEEP)
3155 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3156 aq->agent->device_id, aq->id);
3157 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3158 if (DEBUG_THREAD_SLEEP)
3159 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3160 aq->id);
3163 pthread_mutex_unlock (&aq->mutex);
3164 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
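
/* Usage sketch (illustrative only): enqueue an asynchronous copy and then
   block until the queue has drained, so the transfer is complete on
   return.  The copy itself is executed by the queue's drain thread via
   copy_data above.  */

static inline void
example_async_copy_then_wait (struct goacc_asyncqueue *aq, void *dst,
			      const void *src, size_t len)
{
  queue_push_copy (aq, dst, src, len);
  wait_queue (aq);
}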
3167 /* }}} */
3168 /* {{{ OpenACC support */
3170 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3172 static void
3173 gcn_exec (struct kernel_info *kernel,
3174 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3175 struct goacc_asyncqueue *aq)
3177 if (!GOMP_OFFLOAD_can_run (kernel))
3178 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3180 /* If we get here then this must be an OpenACC kernel. */
3181 kernel->kind = KIND_OPENACC;
3183 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3184 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3186 struct hsa_kernel_description *d
3187 = &kernel->module->image_desc->kernel_infos[i];
3188 if (d->name == kernel->name)
3190 hsa_kernel_desc = d;
3191 break;
3195 /* We may have statically-determined dimensions in
3196 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3197 invocation at runtime in dims[]. We allow static dimensions to take
3198 priority over dynamic dimensions when present (non-zero). */
3199 if (hsa_kernel_desc->oacc_dims[0] > 0)
3200 dims[0] = hsa_kernel_desc->oacc_dims[0];
3201 if (hsa_kernel_desc->oacc_dims[1] > 0)
3202 dims[1] = hsa_kernel_desc->oacc_dims[1];
3203 if (hsa_kernel_desc->oacc_dims[2] > 0)
3204 dims[2] = hsa_kernel_desc->oacc_dims[2];
3206 /* Ideally, when a dimension isn't explicitly specified, we should
3207 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3208 In practice, we tune for peak performance on BabelStream, which
3209 for OpenACC is currently 32 threads per CU. */
3210 if (dims[0] == 0 && dims[1] == 0)
3212 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3213 number. There isn't really a correct answer for this without a clue
3214 about the problem size, so let's do a reasonable number of workers
3215 and gangs. */
3217 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3218 dims[1] = 8; /* Workers. */
3220 else if (dims[0] == 0 && dims[1] > 0)
3222 /* Auto-scale the number of gangs with the requested number of workers. */
3223 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3225 else if (dims[0] > 0 && dims[1] == 0)
3227 /* Auto-scale the number of workers with the requested number of gangs. */
3228 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3229 if (dims[1] == 0)
3230 dims[1] = 1;
3231 if (dims[1] > 16)
3232 dims[1] = 16;
3235 /* The incoming dimensions are expressed in terms of gangs, workers, and
3236 vectors. The HSA dimensions are expressed in terms of "work-items",
3237 which means multiples of vector lanes.
3239 The "grid size" specifies the size of the problem space, and the
3240 "work-group size" specifies how much of that we want a single compute
3241 unit to chew on at once.
3243 The three dimensions do not really correspond to hardware, but the
3244 important thing is that the HSA runtime will launch as many
3245 work-groups as it takes to process the entire grid, and each
3246 work-group will contain as many wave-fronts as it takes to process
3247 the work-items in that group.
3249 Essentially, as long as we set the Y dimension to 64 (the number of
3250 vector lanes in hardware), and the Z group size to the maximum (16),
3251 then we will get the gangs (X) and workers (Z) launched as we expect.
3253 The reason for the apparent reversal of vector and worker dimension
3254 order has to do with the way the runtime distributes work-items across
3255 v1 and v2. */
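
  /* For example (illustrative numbers): 40 gangs and 16 workers yield a
     grid of {40, 64, 16} and work-groups of {1, 64, 16}; the HSA runtime
     then launches 40 work-groups, each holding 16 wavefronts of 64 lanes,
     i.e. the gangs along X and the workers along Z, as described above.  */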
3256 struct GOMP_kernel_launch_attributes kla =
3258 /* Grid size. */
3259 {dims[0], 64, dims[1]},
3260 /* Work-group size. */
3261 {1, 64, 16}
3264 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3265 acc_prof_info *prof_info = thr->prof_info;
3266 acc_event_info enqueue_launch_event_info;
3267 acc_api_info *api_info = thr->api_info;
3268 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3269 if (profiling_dispatch_p)
3271 prof_info->event_type = acc_ev_enqueue_launch_start;
3273 enqueue_launch_event_info.launch_event.event_type
3274 = prof_info->event_type;
3275 enqueue_launch_event_info.launch_event.valid_bytes
3276 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3277 enqueue_launch_event_info.launch_event.parent_construct
3278 = acc_construct_parallel;
3279 enqueue_launch_event_info.launch_event.implicit = 1;
3280 enqueue_launch_event_info.launch_event.tool_info = NULL;
3281 enqueue_launch_event_info.launch_event.kernel_name
3282 = (char *) kernel->name;
3283 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3284 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3285 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3287 api_info->device_api = acc_device_api_other;
3289 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3290 &enqueue_launch_event_info, api_info);
3293 if (!async)
3294 run_kernel (kernel, devaddrs, &kla, NULL, false);
3295 else
3296 queue_push_launch (aq, kernel, devaddrs, &kla);
3298 if (profiling_dispatch_p)
3300 prof_info->event_type = acc_ev_enqueue_launch_end;
3301 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3302 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3303 &enqueue_launch_event_info,
3304 api_info);
3308 /* }}} */
3309 /* {{{ Generic Plugin API */
3311 /* Return the name of the accelerator, which is "gcn". */
3313 const char *
3314 GOMP_OFFLOAD_get_name (void)
3316 return "gcn";
3319 /* Return the specific capabilities the HSA accelerator has. */
3321 unsigned int
3322 GOMP_OFFLOAD_get_caps (void)
3324 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3325 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3326 | GOMP_OFFLOAD_CAP_OPENACC_200;
3329 /* Identify as GCN accelerator. */
3331 int
3332 GOMP_OFFLOAD_get_type (void)
3334 return OFFLOAD_TARGET_TYPE_GCN;
3337 /* Return the libgomp version number we're compatible with. There is
3338 no requirement for cross-version compatibility. */
3340 unsigned
3341 GOMP_OFFLOAD_version (void)
3343 return GOMP_VERSION;
3346 /* Return the number of GCN devices on the system. */
3348 int
3349 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3351 if (!init_hsa_context (true))
3352 exit (EXIT_FAILURE);
3353 /* Return -1 if the omp_requires_mask cannot be fulfilled, but
3354 devices were present. */
3355 if (hsa_context.agent_count > 0
3356 && ((omp_requires_mask
3357 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3358 | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
3359 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3360 return -1;
3361 /* Check whether host page access is supported; this is a system-level
3362 property (all GPUs supported by HSA). While intrinsically true for APUs, it
3363 requires XNACK support for discrete GPUs. */
3364 if (hsa_context.agent_count > 0
3365 && (omp_requires_mask & GOMP_REQUIRES_UNIFIED_SHARED_MEMORY))
3367 bool b;
3368 hsa_system_info_t type = HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT;
3369 hsa_status_t status = hsa_fns.hsa_system_get_info_fn (type, &b);
3370 if (status != HSA_STATUS_SUCCESS)
3371 GOMP_PLUGIN_error ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT "
3372 "failed");
3373 if (!b)
3374 return -1;
3377 return hsa_context.agent_count;
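
/* For example, a program compiled with
     #pragma omp requires unified_shared_memory
   sets GOMP_REQUIRES_UNIFIED_SHARED_MEMORY in omp_requires_mask above; on
   a discrete GPU where the SVM query reports false (typically when XNACK
   is disabled), all GCN devices are then excluded by returning -1.  */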
3380 /* Initialize device (agent) number N so that it can be used for computation.
3381 Return TRUE on success. */
3383 bool
3384 GOMP_OFFLOAD_init_device (int n)
3386 if (!init_hsa_context (false))
3387 return false;
3388 if (n >= hsa_context.agent_count)
3390 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3391 return false;
3393 struct agent_info *agent = &hsa_context.agents[n];
3395 if (agent->initialized)
3396 return true;
3398 agent->device_id = n;
3400 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3402 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3403 return false;
3405 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3407 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3408 return false;
3410 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3412 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3413 return false;
3415 if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
3417 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3418 return false;
3420 agent->async_queues = NULL;
3421 agent->omp_async_queue = NULL;
3422 agent->ephemeral_memories_list = NULL;
3424 uint32_t queue_size;
3425 hsa_status_t status;
3426 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3427 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3428 &queue_size);
3429 if (status != HSA_STATUS_SUCCESS)
3430 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3431 status);
3433 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3434 &agent->name);
3435 if (status != HSA_STATUS_SUCCESS)
3436 return hsa_error ("Error querying the name of the agent", status);
3438 agent->device_isa = isa_code (agent->name);
3439 if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
3440 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3442 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3443 &agent->vendor_name);
3444 if (status != HSA_STATUS_SUCCESS)
3445 return hsa_error ("Error querying the vendor name of the agent", status);
3447 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3448 HSA_QUEUE_TYPE_MULTI,
3449 hsa_queue_callback, NULL, UINT32_MAX,
3450 UINT32_MAX, &agent->sync_queue);
3451 if (status != HSA_STATUS_SUCCESS)
3452 return hsa_error ("Error creating command queue", status);
3454 agent->kernarg_region.handle = (uint64_t) -1;
3455 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3456 get_kernarg_memory_region,
3457 &agent->kernarg_region);
3458 if (status != HSA_STATUS_SUCCESS
3459 && status != HSA_STATUS_INFO_BREAK)
3460 hsa_error ("Scanning memory regions failed", status);
3461 if (agent->kernarg_region.handle == (uint64_t) -1)
3463 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3464 "arguments");
3465 return false;
3467 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3468 dump_hsa_region (agent->kernarg_region, NULL);
3470 agent->data_region.handle = (uint64_t) -1;
3471 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3472 get_data_memory_region,
3473 &agent->data_region);
3474 if (status != HSA_STATUS_SUCCESS
3475 && status != HSA_STATUS_INFO_BREAK)
3476 hsa_error ("Scanning memory regions failed", status);
3477 if (agent->data_region.handle == (uint64_t) -1)
3479 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3480 "data");
3481 return false;
3483 GCN_DEBUG ("Selected device data memory region:\n");
3484 dump_hsa_region (agent->data_region, NULL);
3486 GCN_DEBUG ("GCN agent %d initialized\n", n);
3488 agent->initialized = true;
3489 return true;
3492 /* Load GCN object-code module described by struct gcn_image_desc in
3493 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3494 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3495 contain the on-device addresses of the functions for reverse offload. To be
3496 freed by the caller. */
3498 int
3499 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3500 struct addr_pair **target_table,
3501 uint64_t **rev_fn_table,
3502 uint64_t *host_ind_fn_table)
3504 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3506 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3507 " (expected %u, received %u)",
3508 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3509 return -1;
3512 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3513 struct agent_info *agent;
3514 struct addr_pair *pair;
3515 struct module_info *module;
3516 struct kernel_info *kernel;
3517 int kernel_count = image_desc->kernel_count;
3518 unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3519 ? image_desc->ind_func_count : 0;
3520 unsigned var_count = image_desc->global_variable_count;
3521 /* Currently, "others" is a struct of ICVs. */
3522 int other_count = 1;
3524 agent = get_agent_info (ord);
3525 if (!agent)
3526 return -1;
3528 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3530 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3531 return -1;
3533 if (agent->prog_finalized
3534 && !destroy_hsa_program (agent))
3535 return -1;
3537 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3538 GCN_DEBUG ("Encountered %u indirect functions in an image\n", ind_func_count);
3539 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3540 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
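
  /* Note (assumption implied by the unconditional "- 2" below): mkoffload
     always emits both the _init_array and _fini_array pseudo-kernels,
     which are filtered out later and not reported to libgomp.  */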
3541 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3542 * sizeof (struct addr_pair));
3543 *target_table = pair;
3544 module = (struct module_info *)
3545 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3546 + kernel_count * sizeof (struct kernel_info));
3547 module->image_desc = image_desc;
3548 module->kernel_count = kernel_count;
3549 module->heap = NULL;
3550 module->constructors_run_p = false;
3552 kernel = &module->kernels[0];
3554 /* Allocate memory for kernel dependencies. */
3555 for (unsigned i = 0; i < kernel_count; i++)
3557 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3558 if (!init_basic_kernel_info (kernel, d, agent, module))
3559 return -1;
3560 if (strcmp (d->name, "_init_array") == 0)
3561 module->init_array_func = kernel;
3562 else if (strcmp (d->name, "_fini_array") == 0)
3563 module->fini_array_func = kernel;
3564 else
3566 pair->start = (uintptr_t) kernel;
3567 pair->end = (uintptr_t) (kernel + 1);
3568 pair++;
3570 kernel++;
3573 agent->module = module;
3574 if (pthread_rwlock_unlock (&agent->module_rwlock))
3576 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3577 return -1;
3580 if (!create_and_finalize_hsa_program (agent))
3581 return -1;
3583 if (var_count > 0)
3585 hsa_status_t status;
3586 hsa_executable_symbol_t var_symbol;
3587 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3588 ".offload_var_table",
3589 agent->id,
3590 0, &var_symbol);
3592 if (status != HSA_STATUS_SUCCESS)
3593 hsa_fatal ("Could not find symbol for variable in the code object",
3594 status);
3596 uint64_t var_table_addr;
3597 status = hsa_fns.hsa_executable_symbol_get_info_fn
3598 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3599 &var_table_addr);
3600 if (status != HSA_STATUS_SUCCESS)
3601 hsa_fatal ("Could not extract a variable from its symbol", status);
3603 struct {
3604 uint64_t addr;
3605 uint64_t size;
3606 } var_table[var_count];
3607 GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3608 (void*)var_table_addr, sizeof (var_table));
3610 for (unsigned i = 0; i < var_count; i++)
3612 pair->start = var_table[i].addr;
3613 pair->end = var_table[i].addr + var_table[i].size;
3614 GCN_DEBUG ("Found variable at %p with size %lu\n",
3615 (void *)var_table[i].addr, var_table[i].size);
3616 pair++;
3620 if (ind_func_count > 0)
3622 hsa_status_t status;
3624 /* Read indirect function table from image. */
3625 hsa_executable_symbol_t ind_funcs_symbol;
3626 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3627 ".offload_ind_func_table",
3628 agent->id,
3629 0, &ind_funcs_symbol);
3631 if (status != HSA_STATUS_SUCCESS)
3632 hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3633 "code object", status);
3635 uint64_t ind_funcs_table_addr;
3636 status = hsa_fns.hsa_executable_symbol_get_info_fn
3637 (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3638 &ind_funcs_table_addr);
3639 if (status != HSA_STATUS_SUCCESS)
3640 hsa_fatal ("Could not extract a variable from its symbol", status);
3642 uint64_t ind_funcs_table[ind_func_count];
3643 GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
3644 (void*) ind_funcs_table_addr,
3645 sizeof (ind_funcs_table));
3647 /* Build host->target address map for indirect functions. */
3648 uint64_t ind_fn_map[ind_func_count * 2 + 1];
3649 for (unsigned i = 0; i < ind_func_count; i++)
3651 ind_fn_map[i * 2] = host_ind_fn_table[i];
3652 ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
3653 GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3654 i, host_ind_fn_table[i], ind_funcs_table[i]);
3656 ind_fn_map[ind_func_count * 2] = 0;
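
      /* Illustrative layout for two indirect functions (hypothetical
	 addresses): { host_fn0, dev_fn0, host_fn1, dev_fn1, 0 } -- pairs
	 of host/device addresses, terminated by a zero host address that
	 the device-side lookup can scan for.  */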
3658 /* Write the map onto the target. */
3659 void *map_target_addr
3660 = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
3661 GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
3663 GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
3664 (void*) ind_fn_map,
3665 sizeof (ind_fn_map));
3667 /* Write address of the map onto the target. */
3668 hsa_executable_symbol_t symbol;
3670 status
3671 = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3672 XSTRING (GOMP_INDIRECT_ADDR_MAP),
3673 agent->id, 0, &symbol);
3674 if (status != HSA_STATUS_SUCCESS)
3675 hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3676 status);
3678 uint64_t varptr;
3679 uint32_t varsize;
3681 status = hsa_fns.hsa_executable_symbol_get_info_fn
3682 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3683 &varptr);
3684 if (status != HSA_STATUS_SUCCESS)
3685 hsa_fatal ("Could not extract a variable from its symbol", status);
3686 status = hsa_fns.hsa_executable_symbol_get_info_fn
3687 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3688 &varsize);
3689 if (status != HSA_STATUS_SUCCESS)
3690 hsa_fatal ("Could not extract a variable size from its symbol",
3691 status);
3693 GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3694 varptr, varsize);
3696 GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
3697 &map_target_addr,
3698 sizeof (map_target_addr));

  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 XSTRING (GOMP_ADDITIONAL_ICVS),
						 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &varptr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
	 &varsize);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol",
		   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
		 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };
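
  /* A GCN wavefront is 64 lanes wide, so the 1 x 64 x 1 grid and work-group
     sizes above launch the constructor kernel as a single wavefront in a
     single work-group; the constructors need no more parallelism than
     that.  */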

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_func_table",
						     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
			     (void*) fn_table_addr,
			     kernel_count * sizeof (uint64_t));
    }

  return kernel_count + var_count + other_count;
}
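
/* The value returned above is the number of entries written to libgomp's
   target table: one per reported kernel (the init/fini pseudo-kernels having
   been subtracted from kernel_count), one per variable, plus the "other"
   entries such as the GOMP_ADDITIONAL_ICVS block.  */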

/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
			 "loaded before");
      return false;
    }

  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}

/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  if (!agent->initialized)
    return true;

  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_ephemeral_memories (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}

/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    GOMP_PLUGIN_fatal ("kernel initialization failed");

  return true;
}

/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}

/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return true;
}

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
	     src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
	     device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
  return true;
}

/* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim1_size: number of bytes to copy on innermost dimension ("row")
     dim0_len: number of rows to copy
     dst: base pointer for destination of copy
     dst_offset1_size: innermost row offset (for dest), in bytes
     dst_offset0_len: offset, number of rows (for dest)
     dst_dim1_size: whole-array dest row length, in bytes (pitch)
     src: base pointer for source of copy
     src_offset1_size: innermost row offset (for source), in bytes
     src_offset0_len: offset, number of rows (for source)
     src_dim1_size: whole-array source row length, in bytes (pitch)  */
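
/* For example (illustrative values only): copying a 16-byte-wide, 4-row
   subrectangle that starts at row 2, byte-column 8 of a host array with
   128-byte rows into the top-left corner of a device array with 64-byte rows
   would use:

     dim1_size = 16, dim0_len = 4,
     dst_offset1_size = 0, dst_offset0_len = 0, dst_dim1_size = 64,
     src_offset1_size = 8, src_offset0_len = 2, src_dim1_size = 128

   with dst_ord the device number and src_ord == -1 (host).  */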

int
GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
		       size_t dim0_len, void *dst, size_t dst_offset1_size,
		       size_t dst_offset0_len, size_t dst_dim1_size,
		       const void *src, size_t src_offset1_size,
		       size_t src_offset0_len, size_t src_dim1_size)
{
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last rows of the
     whole array.  Something like this:

	 rows -->
	 ..............
     c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
     o | ++#######+++++ <- whole row
     l | ++#######+++++ <- "
     s v ++#######..... <- last row apart from trailing remainder
	 ..............

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */
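
  /* With this scheme the pinned region is contiguous and spans

       bounded_size_host = pitch * (rows - 1) + row_bytes

     bytes from the first copied element; e.g. with a 128-byte pitch, 4 rows
     and 16 bytes per row that is 128 * 3 + 16 = 400 bytes, rather than the
     full 4 * 128 = 512.  */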

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = src_offset0_len * src_dim1_size
			       + src_offset1_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = dst_offset0_len * dst_dim1_size
			       + dst_offset1_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
				      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller.  So, don't make this an
     error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  srcpp.pitch = src_dim1_size;
  srcpp.slice = 0;

  src_offsets.x = src_offset1_size;
  src_offsets.y = src_offset0_len;
  src_offsets.z = 0;

  dstpp.pitch = dst_dim1_size;
  dstpp.slice = 0;

  dst_offsets.x = dst_offset1_size;
  dst_offsets.y = dst_offset0_len;
  dst_offsets.z = 0;

  ranges.x = dim1_size;
  ranges.y = dim0_len;
  ranges.z = 1;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
						 &src_offsets, &ranges,
						 copy_agent, dir, 0, NULL,
						 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
					HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
					HSA_WAIT_STATE_ACTIVE);

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}

/* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim2_size: number of bytes to copy on innermost dimension ("row")
     dim1_len: number of rows per slice to copy
     dim0_len: number of slices to copy
     dst: base pointer for destination of copy
     dst_offset2_size: innermost row offset (for dest), in bytes
     dst_offset1_len: offset, number of rows (for dest)
     dst_offset0_len: offset, number of slices (for dest)
     dst_dim2_size: whole-array dest row length, in bytes (pitch)
     dst_dim1_len: whole-array number of rows in slice (for dest)
     src: base pointer for source of copy
     src_offset2_size: innermost row offset (for source), in bytes
     src_offset1_len: offset, number of rows (for source)
     src_offset0_len: offset, number of slices (for source)
     src_dim2_size: whole-array source row length, in bytes (pitch)
     src_dim1_len: whole-array number of rows in slice (for source)  */
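
/* For example (illustrative values only): copying a 16-byte x 4-row x
   2-slice block from the origin of a host array with 128-byte rows and
   8 rows per slice into the origin of an equally-shaped device array would
   use:

     dim2_size = 16, dim1_len = 4, dim0_len = 2,
     {dst,src}_offset2_size = 0, {dst,src}_offset1_len = 0,
     {dst,src}_offset0_len = 0,
     {dst,src}_dim2_size = 128, {dst,src}_dim1_len = 8

   with src_ord == -1 for a host-to-device copy.  */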

int
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
		       size_t dim1_len, size_t dim0_len, void *dst,
		       size_t dst_offset2_size, size_t dst_offset1_len,
		       size_t dst_offset0_len, size_t dst_dim2_size,
		       size_t dst_dim1_len, const void *src,
		       size_t src_offset2_size, size_t src_offset1_len,
		       size_t src_offset0_len, size_t src_dim2_size,
		       size_t src_dim1_len)
{
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last slices of
     the whole 3D array.  Something like this:

	   slice 0:        slice 1:        slice 2:
	    __________      __________      __________
	 ^ /+++++++++/ :   /+++++++++/ :   /         /
    column /+++##++++/|  |/+++##++++/|  |/+++##     /     # = subarray
	/ /   ##++++/ |  |/+++##++++/ |  |/+++##++++/     + = area to pin
	 /_________/  :   /_________/  :   /_________/
	       row --->

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */
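
  /* Analogously to the 2D case, the pinned span is

       bounded_size_host = slice_bytes * (slices - 1)
			   + pitch * (rows - 1) + row_bytes

     measured from the first copied element, where slice_bytes is
     pitch * rows-per-slice of the host-side array.  */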

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      size_t slice_bytes = src_dim2_size * src_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
			  + src_dim2_size * (dim1_len - 1)
			  + dim2_size;
      first_elem_offset_host = src_offset0_len * slice_bytes
			       + src_offset1_len * src_dim2_size
			       + src_offset2_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
			  + dst_dim2_size * (dim1_len - 1)
			  + dim2_size;
      first_elem_offset_host = dst_offset0_len * slice_bytes
			       + dst_offset1_len * dst_dim2_size
			       + dst_offset2_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
				      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller (maybe even with iterated memcpy2d
     calls).  So, don't make this an error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  /* Pitch is measured in bytes.  */
  srcpp.pitch = src_dim2_size;
  /* Slice is also measured in bytes (i.e. total per-slice).  */
  srcpp.slice = src_dim2_size * src_dim1_len;

  src_offsets.x = src_offset2_size;
  src_offsets.y = src_offset1_len;
  src_offsets.z = src_offset0_len;

  /* As above.  */
  dstpp.pitch = dst_dim2_size;
  dstpp.slice = dst_dim2_size * dst_dim1_len;

  dst_offsets.x = dst_offset2_size;
  dst_offsets.y = dst_offset1_len;
  dst_offsets.z = dst_offset0_len;

  ranges.x = dim2_size;
  ranges.y = dim1_len;
  ranges.z = dim0_len;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
						 &src_offsets, &ranges,
						 copy_agent, dir, 0, NULL,
						 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    {
      hsa_signal_value_t sv
	= hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
					      HSA_SIGNAL_CONDITION_LT, 1,
					      UINT64_MAX,
					      HSA_WAIT_STATE_ACTIVE);
      if (sv < 0)
	{
	  GCN_WARNING ("async copy rect failure");
	  retval = -1;
	}
    }

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}

/* }}} */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
			void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
		       GOMP_PLUGIN_target_task_completion, async_data);
}

/* }}} */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
			   size_t mapnum __attribute__((unused)),
			   void **hostaddrs __attribute__((unused)),
			   void **devaddrs, unsigned *dims,
			   void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
				 size_t mapnum __attribute__((unused)),
				 void **hostaddrs __attribute__((unused)),
				 void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return false;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return false;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
						     ASYNC_QUEUE_SIZE,
						     HSA_QUEUE_TYPE_MULTI,
						     hsa_queue_callback, NULL,
						     UINT32_MAX, UINT32_MAX,
						     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
		       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
	     aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}
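
/* Both of the preceding functions only enqueue the copy; the queue's drain
   thread later performs it with the same blocking HSA memory-copy helper the
   synchronous paths use, which is why the queue entry needs no direction
   flag.  */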

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
	size_t size;
	hsa_region_t region = agent->data_region;
	hsa_status_t status =
	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
	if (status == HSA_STATUS_SUCCESS)
	  propval.val = size;
	break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}

/* Set up plugin-specific thread-local-data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}} */