ada: Further cleanup in finalization machinery
[official-gcc.git] / libgomp / plugin / plugin-gcn.c
blob7f8178c78b7a62c8dc0a7906f1402636da0066c1
1 /* Plugin for AMD GCN execution.
3 Copyright (C) 2013-2023 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* {{{ Includes and defines */
31 #include "config.h"
32 #include "symcat.h"
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <pthread.h>
37 #include <inttypes.h>
38 #include <stdbool.h>
39 #include <limits.h>
40 #include <hsa.h>
41 #include <hsa_ext_amd.h>
42 #include <dlfcn.h>
43 #include <signal.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
47 #include <elf.h>
48 #include "oacc-plugin.h"
49 #include "oacc-int.h"
50 #include <assert.h>
52 /* These probably won't be in elf.h for a while. */
53 #ifndef R_AMDGPU_NONE
54 #define R_AMDGPU_NONE 0
55 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57 #define R_AMDGPU_ABS64 3 /* S + A */
58 #define R_AMDGPU_REL32 4 /* S + A - P */
59 #define R_AMDGPU_REL64 5 /* S + A - P */
60 #define R_AMDGPU_ABS32 6 /* S + A */
61 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
66 #define R_AMDGPU_RELATIVE64 13 /* B + A */
67 #endif
69 /* GCN specific definitions for asynchronous queues. */
71 #define ASYNC_QUEUE_SIZE 64
72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
73 #define DEBUG_QUEUES 0
74 #define DEBUG_THREAD_SLEEP 0
75 #define DEBUG_THREAD_SIGNAL 0
77 /* Defaults. */
78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
80 /* Secure getenv() which returns NULL if running as SUID/SGID. */
81 #ifndef HAVE_SECURE_GETENV
82 #ifdef HAVE___SECURE_GETENV
83 #define secure_getenv __secure_getenv
84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
87 #include <unistd.h>
/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  Returns the value
   of NAME from the environment, or NULL when running with elevated
   privileges (real and effective user/group IDs differ), so that
   environment-supplied settings cannot influence SUID/SGID execution.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}
101 #else
102 #define secure_getenv getenv
103 #endif
104 #endif
106 /* }}} */
107 /* {{{ Types */
109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
111 struct gcn_thread
113 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
114 int async;
117 /* As an HSA runtime is dlopened, following structure defines function
118 pointers utilized by the HSA plug-in. */
120 struct hsa_runtime_fn_info
122 /* HSA runtime. */
123 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
124 const char **status_string);
125 hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
126 void *value);
127 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
128 hsa_agent_info_t attribute,
129 void *value);
130 hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
131 hsa_isa_info_t attribute,
132 uint32_t index,
133 void *value);
134 hsa_status_t (*hsa_init_fn) (void);
135 hsa_status_t (*hsa_iterate_agents_fn)
136 (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
137 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
138 hsa_region_info_t attribute,
139 void *value);
140 hsa_status_t (*hsa_queue_create_fn)
141 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
142 void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
143 void *data, uint32_t private_segment_size,
144 uint32_t group_segment_size, hsa_queue_t **queue);
145 hsa_status_t (*hsa_agent_iterate_regions_fn)
146 (hsa_agent_t agent,
147 hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
148 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
149 hsa_status_t (*hsa_executable_create_fn)
150 (hsa_profile_t profile, hsa_executable_state_t executable_state,
151 const char *options, hsa_executable_t *executable);
152 hsa_status_t (*hsa_executable_global_variable_define_fn)
153 (hsa_executable_t executable, const char *variable_name, void *address);
154 hsa_status_t (*hsa_executable_load_code_object_fn)
155 (hsa_executable_t executable, hsa_agent_t agent,
156 hsa_code_object_t code_object, const char *options);
157 hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
158 const char *options);
159 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
160 uint32_t num_consumers,
161 const hsa_agent_t *consumers,
162 hsa_signal_t *signal);
163 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
164 void **ptr);
165 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
166 hsa_access_permission_t access);
167 hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
168 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
169 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
170 hsa_status_t (*hsa_executable_get_symbol_fn)
171 (hsa_executable_t executable, const char *module_name,
172 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
173 hsa_executable_symbol_t *symbol);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn)
175 (hsa_executable_symbol_t executable_symbol,
176 hsa_executable_symbol_info_t attribute, void *value);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn)
178 (hsa_executable_t executable,
179 hsa_status_t (*callback)(hsa_executable_t executable,
180 hsa_executable_symbol_t symbol, void *data),
181 void *data);
182 uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
183 uint64_t value);
184 uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
185 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
186 hsa_signal_value_t value);
187 void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
188 hsa_signal_value_t value);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
190 (hsa_signal_t signal, hsa_signal_condition_t condition,
191 hsa_signal_value_t compare_value, uint64_t timeout_hint,
192 hsa_wait_state_t wait_state_hint);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
194 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
196 hsa_status_t (*hsa_code_object_deserialize_fn)
197 (void *serialized_code_object, size_t serialized_code_object_size,
198 const char *options, hsa_code_object_t *code_object);
201 /* Structure describing the run-time and grid properties of an HSA kernel
202 lauch. This needs to match the format passed to GOMP_OFFLOAD_run. */
204 struct GOMP_kernel_launch_attributes
206 /* Number of dimensions the workload has. Maximum number is 3. */
207 uint32_t ndim;
208 /* Size of the grid in the three respective dimensions. */
209 uint32_t gdims[3];
210 /* Size of work-groups in the respective dimensions. */
211 uint32_t wdims[3];
214 /* Collection of information needed for a dispatch of a kernel from a
215 kernel. */
217 struct kernel_dispatch
219 struct agent_info *agent;
220 /* Pointer to a command queue associated with a kernel dispatch agent. */
221 void *queue;
222 /* Pointer to a memory space used for kernel arguments passing. */
223 void *kernarg_address;
224 /* Kernel object. */
225 uint64_t object;
226 /* Synchronization signal used for dispatch synchronization. */
227 uint64_t signal;
228 /* Private segment size. */
229 uint32_t private_segment_size;
230 /* Group segment size. */
231 uint32_t group_segment_size;
234 /* Structure of the kernargs segment, supporting console output.
236 This needs to match the definitions in Newlib, and the expectations
237 in libgomp target code. */
239 struct kernargs {
240 struct kernargs_abi abi;
242 /* Output data. */
243 struct output output_data;
246 /* A queue entry for a future asynchronous launch. */
248 struct kernel_launch
250 struct kernel_info *kernel;
251 void *vars;
252 struct GOMP_kernel_launch_attributes kla;
255 /* A queue entry for a future callback. */
257 struct callback
259 void (*fn)(void *);
260 void *data;
263 /* A data struct for the copy_data callback. */
265 struct copy_data
267 void *dst;
268 const void *src;
269 size_t len;
270 struct goacc_asyncqueue *aq;
273 /* A queue entry for a placeholder. These correspond to a wait event. */
275 struct placeholder
277 int executed;
278 pthread_cond_t cond;
279 pthread_mutex_t mutex;
282 /* A queue entry for a wait directive. */
284 struct asyncwait_info
286 struct placeholder *placeholderp;
289 /* Encode the type of an entry in an async queue. */
291 enum entry_type
293 KERNEL_LAUNCH,
294 CALLBACK,
295 ASYNC_WAIT,
296 ASYNC_PLACEHOLDER
299 /* An entry in an async queue. */
301 struct queue_entry
303 enum entry_type type;
304 union {
305 struct kernel_launch launch;
306 struct callback callback;
307 struct asyncwait_info asyncwait;
308 struct placeholder placeholder;
309 } u;
312 /* An async queue header.
314 OpenMP may create one of these.
315 OpenACC may create many. */
317 struct goacc_asyncqueue
319 struct agent_info *agent;
320 hsa_queue_t *hsa_queue;
322 pthread_t thread_drain_queue;
323 pthread_mutex_t mutex;
324 pthread_cond_t queue_cond_in;
325 pthread_cond_t queue_cond_out;
326 struct queue_entry queue[ASYNC_QUEUE_SIZE];
327 int queue_first;
328 int queue_n;
329 int drain_queue_stop;
331 int id;
332 struct goacc_asyncqueue *prev;
333 struct goacc_asyncqueue *next;
336 /* Mkoffload uses this structure to describe a kernel.
338 OpenMP kernel dimensions are passed at runtime.
339 OpenACC kernel dimensions are passed at compile time, here. */
341 struct hsa_kernel_description
343 const char *name;
344 int oacc_dims[3]; /* Only present for GCN kernels. */
345 int sgpr_count;
346 int vpgr_count;
349 /* Mkoffload uses this structure to describe an offload variable. */
351 struct global_var_info
353 const char *name;
354 void *address;
357 /* Mkoffload uses this structure to describe all the kernels in a
358 loadable module. These are passed the libgomp via static constructors. */
360 struct gcn_image_desc
362 struct gcn_image {
363 size_t size;
364 void *image;
365 } *gcn_image;
366 const unsigned kernel_count;
367 struct hsa_kernel_description *kernel_infos;
368 const unsigned ind_func_count;
369 const unsigned global_variable_count;
372 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
373 support.
374 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
376 typedef enum {
377 EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
378 EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
379 EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
380 EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
381 EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
382 EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036
383 } EF_AMDGPU_MACH;
385 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
386 typedef EF_AMDGPU_MACH gcn_isa;
388 /* Description of an HSA GPU agent (device) and the program associated with
389 it. */
391 struct agent_info
393 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
394 hsa_agent_t id;
395 /* The user-visible device number. */
396 int device_id;
397 /* Whether the agent has been initialized. The fields below are usable only
398 if it has been. */
399 bool initialized;
401 /* The instruction set architecture of the device. */
402 gcn_isa device_isa;
403 /* Name of the agent. */
404 char name[64];
405 /* Name of the vendor of the agent. */
406 char vendor_name[64];
407 /* Command queues of the agent. */
408 hsa_queue_t *sync_queue;
409 struct goacc_asyncqueue *async_queues, *omp_async_queue;
410 pthread_mutex_t async_queues_mutex;
412 /* The HSA memory region from which to allocate kernel arguments. */
413 hsa_region_t kernarg_region;
415 /* The HSA memory region from which to allocate device data. */
416 hsa_region_t data_region;
418 /* Allocated ephemeral memories (team arena and stack space). */
419 struct ephemeral_memories_list *ephemeral_memories_list;
420 pthread_mutex_t ephemeral_memories_write_lock;
422 /* Read-write lock that protects kernels which are running or about to be run
423 from interference with loading and unloading of images. Needs to be
424 locked for reading while a kernel is being run, and for writing if the
425 list of modules is manipulated (and thus the HSA program invalidated). */
426 pthread_rwlock_t module_rwlock;
428 /* The module associated with this kernel. */
429 struct module_info *module;
431 /* Mutex enforcing that only one thread will finalize the HSA program. A
432 thread should have locked agent->module_rwlock for reading before
433 acquiring it. */
434 pthread_mutex_t prog_mutex;
435 /* Flag whether the HSA program that consists of all the modules has been
436 finalized. */
437 bool prog_finalized;
438 /* HSA executable - the finalized program that is used to locate kernels. */
439 hsa_executable_t executable;
442 /* Information required to identify, finalize and run any given kernel. */
444 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
446 struct kernel_info
448 /* Name of the kernel, required to locate it within the GCN object-code
449 module. */
450 const char *name;
451 /* The specific agent the kernel has been or will be finalized for and run
452 on. */
453 struct agent_info *agent;
454 /* The specific module where the kernel takes place. */
455 struct module_info *module;
456 /* Information provided by mkoffload associated with the kernel. */
457 struct hsa_kernel_description *description;
458 /* Mutex enforcing that at most once thread ever initializes a kernel for
459 use. A thread should have locked agent->module_rwlock for reading before
460 acquiring it. */
461 pthread_mutex_t init_mutex;
462 /* Flag indicating whether the kernel has been initialized and all fields
463 below it contain valid data. */
464 bool initialized;
465 /* Flag indicating that the kernel has a problem that blocks an execution. */
466 bool initialization_failed;
467 /* The object to be put into the dispatch queue. */
468 uint64_t object;
469 /* Required size of kernel arguments. */
470 uint32_t kernarg_segment_size;
471 /* Required size of group segment. */
472 uint32_t group_segment_size;
473 /* Required size of private segment. */
474 uint32_t private_segment_size;
475 /* Set up for OpenMP or OpenACC? */
476 enum offload_kind kind;
479 /* Information about a particular GCN module, its image and kernels. */
481 struct module_info
483 /* The description with which the program has registered the image. */
484 struct gcn_image_desc *image_desc;
485 /* GCN heap allocation. */
486 struct heap *heap;
487 /* Physical boundaries of the loaded module. */
488 Elf64_Addr phys_address_start;
489 Elf64_Addr phys_address_end;
491 bool constructors_run_p;
492 struct kernel_info *init_array_func, *fini_array_func;
494 /* Number of kernels in this module. */
495 int kernel_count;
496 /* An array of kernel_info structures describing each kernel in this
497 module. */
498 struct kernel_info kernels[];
501 /* A linked list of memory arenas allocated on the device.
502 These are used by OpenMP, as a means to optimize per-team malloc,
503 and for host-accessible stack space. */
505 struct ephemeral_memories_list
507 struct ephemeral_memories_list *next;
509 /* The size is determined by the number of teams and threads. */
510 size_t size;
511 /* The device address allocated memory. */
512 void *address;
513 /* A flag to prevent two asynchronous kernels trying to use the same memory.
514 The mutex is locked until the kernel exits. */
515 pthread_mutex_t in_use;
518 /* Information about the whole HSA environment and all of its agents. */
520 struct hsa_context_info
522 /* Whether the structure has been initialized. */
523 bool initialized;
524 /* Number of usable GPU HSA agents in the system. */
525 int agent_count;
526 /* Array of agent_info structures describing the individual HSA agents. */
527 struct agent_info *agents;
528 /* Driver version string. */
529 char driver_version_s[30];
532 /* }}} */
533 /* {{{ Global variables */
535 /* Information about the whole HSA environment and all of its agents. */
537 static struct hsa_context_info hsa_context;
539 /* HSA runtime functions that are initialized in init_hsa_context. */
541 static struct hsa_runtime_fn_info hsa_fns;
543 /* Heap space, allocated target-side, provided for use of newlib malloc.
544 Each module should have it's own heap allocated.
545 Beware that heap usage increases with OpenMP teams. See also arenas. */
547 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
549 /* Ephemeral memory sizes for each kernel launch. */
551 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
552 static int stack_size = DEFAULT_GCN_STACK_SIZE;
553 static int lowlat_size = -1;
555 /* Flag to decide whether print to stderr information about what is going on.
556 Set in init_debug depending on environment variables. */
558 static bool debug;
560 /* Flag to decide if the runtime should suppress a possible fallback to host
561 execution. */
563 static bool suppress_host_fallback;
565 /* Flag to locate HSA runtime shared library that is dlopened
566 by this plug-in. */
568 static const char *hsa_runtime_lib;
570 /* Flag to decide if the runtime should support also CPU devices (can be
571 a simulator). */
573 static bool support_cpu_devices;
575 /* Runtime dimension overrides. Zero indicates default. */
577 static int override_x_dim = 0;
578 static int override_z_dim = 0;
580 /* }}} */
581 /* {{{ Debug & Diagnostic */
/* Print a message to stderr if GCN_DEBUG value is set to true.
   Wrapped in do { } while (false) with NO trailing semicolon so that the
   macro behaves as a single statement in every context (in particular
   between "if" and "else"); callers supply the semicolon.  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	{ \
	  fprintf (stderr, __VA_ARGS__); \
	} \
    } \
  while (false)
595 /* Flush stderr if GCN_DEBUG value is set to true. */
597 #define DEBUG_FLUSH() \
598 do { \
599 if (debug) \
600 fflush (stderr); \
601 } while (false)
/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true, then flush so the output is visible even if the
   process later aborts.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)
614 /* Print a debugging message to stderr. */
616 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
618 /* Print a warning message to stderr. */
620 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
622 /* Print HSA warning STR with an HSA STATUS code. */
624 static void
625 hsa_warn (const char *str, hsa_status_t status)
627 if (!debug)
628 return;
630 const char *hsa_error_msg = "[unknown]";
631 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
633 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
634 hsa_error_msg);
637 /* Report a fatal error STR together with the HSA error corresponding to STATUS
638 and terminate execution of the current process. */
640 static void
641 hsa_fatal (const char *str, hsa_status_t status)
643 const char *hsa_error_msg = "[unknown]";
644 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
645 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
646 hsa_error_msg);
649 /* Like hsa_fatal, except only report error message, and return FALSE
650 for propagating error processing to outside of plugin. */
652 static bool
653 hsa_error (const char *str, hsa_status_t status)
655 const char *hsa_error_msg = "[unknown]";
656 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
657 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
658 hsa_error_msg);
659 return false;
662 /* Dump information about the available hardware. */
664 static void
665 dump_hsa_system_info (void)
667 hsa_status_t status;
669 hsa_endianness_t endianness;
670 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
671 &endianness);
672 if (status == HSA_STATUS_SUCCESS)
673 switch (endianness)
675 case HSA_ENDIANNESS_LITTLE:
676 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
677 break;
678 case HSA_ENDIANNESS_BIG:
679 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
680 break;
681 default:
682 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
684 else
685 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
687 uint8_t extensions[128];
688 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
689 &extensions);
690 if (status == HSA_STATUS_SUCCESS)
692 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
693 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
695 else
696 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
699 /* Dump information about the available hardware. */
701 static void
702 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
704 switch (machine_model)
706 case HSA_MACHINE_MODEL_SMALL:
707 GCN_DEBUG ("%s: SMALL\n", s);
708 break;
709 case HSA_MACHINE_MODEL_LARGE:
710 GCN_DEBUG ("%s: LARGE\n", s);
711 break;
712 default:
713 GCN_WARNING ("%s: UNKNOWN\n", s);
714 break;
718 /* Dump information about the available hardware. */
720 static void
721 dump_profile (hsa_profile_t profile, const char *s)
723 switch (profile)
725 case HSA_PROFILE_FULL:
726 GCN_DEBUG ("%s: FULL\n", s);
727 break;
728 case HSA_PROFILE_BASE:
729 GCN_DEBUG ("%s: BASE\n", s);
730 break;
731 default:
732 GCN_WARNING ("%s: UNKNOWN\n", s);
733 break;
737 /* Dump information about a device memory region. */
739 static hsa_status_t
740 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
742 hsa_status_t status;
744 hsa_region_segment_t segment;
745 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
746 &segment);
747 if (status == HSA_STATUS_SUCCESS)
749 if (segment == HSA_REGION_SEGMENT_GLOBAL)
750 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
751 else if (segment == HSA_REGION_SEGMENT_READONLY)
752 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
753 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
754 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
755 else if (segment == HSA_REGION_SEGMENT_GROUP)
756 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
757 else
758 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
760 else
761 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
763 if (segment == HSA_REGION_SEGMENT_GLOBAL)
765 uint32_t flags;
766 status
767 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
768 &flags);
769 if (status == HSA_STATUS_SUCCESS)
771 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
772 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
773 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
774 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
775 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
776 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
778 else
779 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
782 size_t size;
783 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
784 if (status == HSA_STATUS_SUCCESS)
785 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
786 else
787 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
789 status
790 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
791 &size);
792 if (status == HSA_STATUS_SUCCESS)
793 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
794 else
795 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
797 bool alloc_allowed;
798 status
799 = hsa_fns.hsa_region_get_info_fn (region,
800 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
801 &alloc_allowed);
802 if (status == HSA_STATUS_SUCCESS)
803 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
804 else
805 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
807 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
808 return HSA_STATUS_SUCCESS;
810 status
811 = hsa_fns.hsa_region_get_info_fn (region,
812 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
813 &size);
814 if (status == HSA_STATUS_SUCCESS)
815 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
816 else
817 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
819 size_t align;
820 status
821 = hsa_fns.hsa_region_get_info_fn (region,
822 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
823 &align);
824 if (status == HSA_STATUS_SUCCESS)
825 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
826 else
827 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
829 return HSA_STATUS_SUCCESS;
832 /* Dump information about all the device memory regions. */
834 static void
835 dump_hsa_regions (hsa_agent_t agent)
837 hsa_status_t status;
838 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
839 dump_hsa_region,
840 NULL);
841 if (status != HSA_STATUS_SUCCESS)
842 hsa_error ("Dumping hsa regions failed", status);
845 /* Dump information about the available devices. */
847 static hsa_status_t
848 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
850 hsa_status_t status;
852 char buf[64];
853 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
854 &buf);
855 if (status == HSA_STATUS_SUCCESS)
856 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
857 else
858 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
860 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
861 &buf);
862 if (status == HSA_STATUS_SUCCESS)
863 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
864 else
865 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
867 hsa_machine_model_t machine_model;
868 status
869 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
870 &machine_model);
871 if (status == HSA_STATUS_SUCCESS)
872 dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
873 else
874 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
876 hsa_profile_t profile;
877 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
878 &profile);
879 if (status == HSA_STATUS_SUCCESS)
880 dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
881 else
882 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
884 hsa_device_type_t device_type;
885 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
886 &device_type);
887 if (status == HSA_STATUS_SUCCESS)
889 switch (device_type)
891 case HSA_DEVICE_TYPE_CPU:
892 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
893 break;
894 case HSA_DEVICE_TYPE_GPU:
895 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
896 break;
897 case HSA_DEVICE_TYPE_DSP:
898 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
899 break;
900 default:
901 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
902 break;
905 else
906 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
908 uint32_t cu_count;
909 status = hsa_fns.hsa_agent_get_info_fn
910 (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
911 if (status == HSA_STATUS_SUCCESS)
912 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
913 else
914 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
916 uint32_t size;
917 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
918 &size);
919 if (status == HSA_STATUS_SUCCESS)
920 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
921 else
922 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
924 uint32_t max_dim;
925 status = hsa_fns.hsa_agent_get_info_fn (agent,
926 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
927 &max_dim);
928 if (status == HSA_STATUS_SUCCESS)
929 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
930 else
931 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
933 uint32_t max_size;
934 status = hsa_fns.hsa_agent_get_info_fn (agent,
935 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
936 &max_size);
937 if (status == HSA_STATUS_SUCCESS)
938 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
939 else
940 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
942 uint32_t grid_max_dim;
943 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
944 &grid_max_dim);
945 if (status == HSA_STATUS_SUCCESS)
946 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
947 else
948 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
950 uint32_t grid_max_size;
951 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
952 &grid_max_size);
953 if (status == HSA_STATUS_SUCCESS)
954 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
955 else
956 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
958 dump_hsa_regions (agent);
960 return HSA_STATUS_SUCCESS;
963 /* Forward reference. */
965 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
967 /* Helper function for dump_executable_symbols. */
969 static hsa_status_t
970 dump_executable_symbol (hsa_executable_t executable,
971 hsa_executable_symbol_t symbol,
972 void *data __attribute__((unused)))
974 char *name = get_executable_symbol_name (symbol);
976 if (name)
978 GCN_DEBUG ("executable symbol: %s\n", name);
979 free (name);
982 return HSA_STATUS_SUCCESS;
985 /* Dump all global symbol in an executable. */
987 static void
988 dump_executable_symbols (hsa_executable_t executable)
990 hsa_status_t status;
991 status
992 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
993 dump_executable_symbol,
994 NULL);
995 if (status != HSA_STATUS_SUCCESS)
996 hsa_fatal ("Could not dump HSA executable symbols", status);
999 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1001 static void
1002 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1004 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1006 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1007 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1008 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1009 fprintf (stderr, "%*sheap address: %p\n", indent, "",
1010 (void*)kernargs->abi.heap_ptr);
1011 fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
1012 "", (void*)kernargs->abi.arena_ptr,
1013 kernargs->abi.arena_size_per_team);
1014 fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
1015 "", (void*)kernargs->abi.stack_ptr,
1016 kernargs->abi.stack_size_per_thread);
1017 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1018 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1019 dispatch->private_segment_size);
1020 fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
1021 "", dispatch->group_segment_size);
1022 fprintf (stderr, "\n");
1025 /* }}} */
1026 /* {{{ Utility functions */
/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}
1036 /* Initialize debug and suppress_host_fallback according to the environment. */
1038 static void
1039 init_environment_variables (void)
1041 if (secure_getenv ("GCN_DEBUG"))
1042 debug = true;
1043 else
1044 debug = false;
1046 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1047 suppress_host_fallback = true;
1048 else
1049 suppress_host_fallback = false;
1051 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1052 if (hsa_runtime_lib == NULL)
1053 hsa_runtime_lib = "libhsa-runtime64.so.1";
1055 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1057 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1058 if (!x)
1059 x = secure_getenv ("GCN_NUM_GANGS");
1060 if (x)
1061 override_x_dim = atoi (x);
1063 const char *z = secure_getenv ("GCN_NUM_THREADS");
1064 if (!z)
1065 z = secure_getenv ("GCN_NUM_WORKERS");
1066 if (z)
1067 override_z_dim = atoi (z);
1069 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1070 if (heap)
1072 size_t tmp = atol (heap);
1073 if (tmp)
1074 gcn_kernel_heap_size = tmp;
1077 const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
1078 if (arena)
1080 int tmp = atoi (arena);
1081 if (tmp)
1082 team_arena_size = tmp;;
1085 const char *stack = secure_getenv ("GCN_STACK_SIZE");
1086 if (stack)
1088 int tmp = atoi (stack);
1089 if (tmp)
1090 stack_size = tmp;;
1093 const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1094 if (lowlat)
1095 lowlat_size = atoi (lowlat);
/* Return malloc'd string with name of SYMBOL.  Returns NULL (after
   reporting an HSA error) on failure.  The caller owns the returned
   buffer and must free it.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  /* +1 for a NUL terminator; assumes the runtime-reported length does not
     include one -- consistent with the explicit res[len] = '\0' below.  */
  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}
1136 /* Get the number of GPU Compute Units. */
1138 static int
1139 get_cu_count (struct agent_info *agent)
1141 uint32_t cu_count;
1142 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1143 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1144 if (status == HSA_STATUS_SUCCESS)
1145 return cu_count;
1146 else
1147 return 64; /* The usual number for older devices. */
/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  return threads > 16 ? 16 : threads;
}
/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int cap = 2 * get_cu_count (agent);
  return teams > cap ? cap : teams;
}
/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  /* INPUT is a NULL-terminated sequence of (id, optional value) words;
     scan it for either explicit HSA launch attributes or generic
     teams/thread-limit hints.  */
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      /* Clamp so the value fits the int fields used below.  */
      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  /* Explicit launch attributes follow; stop scanning -- *input now
	     points at the GOMP_kernel_launch_attributes block.  */
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      /* Vega 10 needs the thread count limited when it was not requested
	 explicitly (neither via attributes nor GCN_NUM_THREADS).  */
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
	 In practice, we tune for peak performance on BabelStream, which
	 for OpenACC is currently 32 threads per CU.  */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads.  */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads.  */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      /* No usable hints at all: fall back to a minimal 1x1x1 launch.  */
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  /* Explicit attributes were supplied in-line; validate and use them.  */
  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      /* A zero-sized grid means there is nothing to run.  */
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      switch (ndim)
	{
	case 1:
	  group = 64;
	  break;
	case 2:
	  group = 8;
	  break;
	default:
	  group = 4;
	  break;
	}
    }

  /* A group can never be larger than the grid it tiles.  */
  return group > grid ? grid : group;
}
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  /* HEADER occupies the low half-word, REST the high half-word.  */
  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
}
/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  /* Any queue event here is unexpected; abort with the HSA status code.  */
  hsa_fatal ("Asynchronous queue error", status);
}
1362 /* }}} */
1363 /* {{{ HSA initialization */
1365 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1367 static bool
1368 init_hsa_runtime_functions (void)
1370 #define DLSYM_FN(function) \
1371 hsa_fns.function##_fn = dlsym (handle, #function); \
1372 if (hsa_fns.function##_fn == NULL) \
1373 return false;
1374 void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1375 if (handle == NULL)
1376 return false;
1378 DLSYM_FN (hsa_status_string)
1379 DLSYM_FN (hsa_system_get_info)
1380 DLSYM_FN (hsa_agent_get_info)
1381 DLSYM_FN (hsa_init)
1382 DLSYM_FN (hsa_iterate_agents)
1383 DLSYM_FN (hsa_region_get_info)
1384 DLSYM_FN (hsa_queue_create)
1385 DLSYM_FN (hsa_agent_iterate_regions)
1386 DLSYM_FN (hsa_executable_destroy)
1387 DLSYM_FN (hsa_executable_create)
1388 DLSYM_FN (hsa_executable_global_variable_define)
1389 DLSYM_FN (hsa_executable_load_code_object)
1390 DLSYM_FN (hsa_executable_freeze)
1391 DLSYM_FN (hsa_signal_create)
1392 DLSYM_FN (hsa_memory_allocate)
1393 DLSYM_FN (hsa_memory_assign_agent)
1394 DLSYM_FN (hsa_memory_copy)
1395 DLSYM_FN (hsa_memory_free)
1396 DLSYM_FN (hsa_signal_destroy)
1397 DLSYM_FN (hsa_executable_get_symbol)
1398 DLSYM_FN (hsa_executable_symbol_get_info)
1399 DLSYM_FN (hsa_executable_iterate_symbols)
1400 DLSYM_FN (hsa_queue_add_write_index_release)
1401 DLSYM_FN (hsa_queue_load_read_index_acquire)
1402 DLSYM_FN (hsa_signal_wait_acquire)
1403 DLSYM_FN (hsa_signal_store_relaxed)
1404 DLSYM_FN (hsa_signal_store_release)
1405 DLSYM_FN (hsa_signal_load_acquire)
1406 DLSYM_FN (hsa_queue_destroy)
1407 DLSYM_FN (hsa_code_object_deserialize)
1408 return true;
1409 #undef DLSYM_FN
/* Return true if the agent is a GPU and can accept of concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      /* CPU agents are only accepted when explicitly enabled via
	 GCN_SUPPORT_CPU_DEVICES (see init_environment_variables).  */
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  /* The agent must support kernel dispatch ...  */
  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  /* ... and multi-producer queues, so several host threads may submit
     work concurrently.  */
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}
1453 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1454 agent_count in hsa_context. */
1456 static hsa_status_t
1457 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1459 if (suitable_hsa_agent_p (agent))
1460 hsa_context.agent_count++;
1461 return HSA_STATUS_SUCCESS;
1464 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1465 id to the describing structure in the hsa context. The index of the
1466 structure is pointed to by DATA, increment it afterwards. */
1468 static hsa_status_t
1469 assign_agent_ids (hsa_agent_t agent, void *data)
1471 if (suitable_hsa_agent_p (agent))
1473 int *agent_index = (int *) data;
1474 hsa_context.agents[*agent_index].id = agent;
1475 ++*agent_index;
1477 return HSA_STATUS_SUCCESS;
1480 /* Initialize hsa_context if it has not already been done.
1481 Return TRUE on success. */
1483 static bool
1484 init_hsa_context (void)
1486 hsa_status_t status;
1487 int agent_index = 0;
1489 if (hsa_context.initialized)
1490 return true;
1491 init_environment_variables ();
1492 if (!init_hsa_runtime_functions ())
1494 GCN_WARNING ("Run-time could not be dynamically opened\n");
1495 if (suppress_host_fallback)
1496 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1497 return false;
1499 status = hsa_fns.hsa_init_fn ();
1500 if (status != HSA_STATUS_SUCCESS)
1501 return hsa_error ("Run-time could not be initialized", status);
1502 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1504 if (debug)
1505 dump_hsa_system_info ();
1507 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1508 if (status != HSA_STATUS_SUCCESS)
1509 return hsa_error ("GCN GPU devices could not be enumerated", status);
1510 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1512 hsa_context.agents
1513 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1514 * sizeof (struct agent_info));
1515 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1516 if (status != HSA_STATUS_SUCCESS)
1517 return hsa_error ("Scanning compute agents failed", status);
1518 if (agent_index != hsa_context.agent_count)
1520 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1521 return false;
1524 if (debug)
1526 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1527 if (status != HSA_STATUS_SUCCESS)
1528 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1531 uint16_t minor, major;
1532 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1533 &minor);
1534 if (status != HSA_STATUS_SUCCESS)
1535 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1536 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1537 &major);
1538 if (status != HSA_STATUS_SUCCESS)
1539 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1541 size_t len = sizeof hsa_context.driver_version_s;
1542 int printed = snprintf (hsa_context.driver_version_s, len,
1543 "HSA Runtime %hu.%hu", (unsigned short int)major,
1544 (unsigned short int)minor);
1545 if (printed >= len)
1546 GCN_WARNING ("HSA runtime version string was truncated."
1547 "Version %hu.%hu is too long.", (unsigned short int)major,
1548 (unsigned short int)minor);
1550 hsa_context.initialized = true;
1551 return true;
/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  /* N is valid: hand back a pointer into the context's agent table.  */
  return &hsa_context.agents[n];
}
/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  /* Only global-segment regions are of interest; keep iterating.  */
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      /* Found a match: record it and stop the iteration.  */
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}
/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  /* DATA is the hsa_region_t out-parameter threaded through the iterator.  */
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}
/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  /* DATA is the hsa_region_t out-parameter threaded through the iterator.  */
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
/* Extract the AMDGPU machine/ISA field from the ELF header flags of a GCN
   offload image.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}
/* ISA names as used by the HSA runtime and (mostly) by GCC's -march=.
   Storage-class specifier first: "const static" is valid but obsolescent
   per the C standard; "static const" is the conventional order.  */
static const char *gcn_gfx803_s = "gfx803";
static const char *gcn_gfx900_s = "gfx900";
static const char *gcn_gfx906_s = "gfx906";
static const char *gcn_gfx908_s = "gfx908";
static const char *gcn_gfx90a_s = "gfx90a";
static const char *gcn_gfx1030_s = "gfx1030";
/* Length of the common "gfxNNN" prefix.  NOTE(review): "gfx1030" is one
   character longer than this; do not use it for exact matching.  */
static const int gcn_isa_name_len = 6;
1646 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1647 support the ISA. */
1649 static const char*
1650 isa_hsa_name (int isa) {
1651 switch(isa)
1653 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1654 return gcn_gfx803_s;
1655 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1656 return gcn_gfx900_s;
1657 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1658 return gcn_gfx906_s;
1659 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1660 return gcn_gfx908_s;
1661 case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1662 return gcn_gfx90a_s;
1663 case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1664 return gcn_gfx1030_s;
1666 return NULL;
/* Returns the user-facing name that GCC uses to identify the architecture (e.g.
   with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char*
isa_gcc_name (int isa) {
  switch(isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      /* GCC spells gfx803 as "fiji"; every other name matches the HSA one.  */
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}
1684 /* Returns the code which is used in the GCN object code to identify the ISA with
1685 the given name (as used by the HSA runtime). */
1687 static gcn_isa
1688 isa_code(const char *isa) {
1689 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1690 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1692 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1693 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1695 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1696 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1698 if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1699 return EF_AMDGPU_MACH_AMDGCN_GFX908;
1701 if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1702 return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1704 if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
1705 return EF_AMDGPU_MACH_AMDGCN_GFX1030;
1707 return -1;
/* CDNA2 devices have twice as many VGPRs compared to older devices.  */

static int
max_isa_vgprs (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
      return 256;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      /* CDNA2 (gfx90a): doubled VGPR file.  */
      return 512;
    }
  /* GOMP_PLUGIN_fatal does not return, so no value is needed here.  */
  GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
}
1729 /* }}} */
1730 /* {{{ Run */
/* Create or reuse a team arena and stack space.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   The stack space is used by all kernels.  We must allocate it in such a
   way that the reverse offload implmentation can access the data.

   Allocating this memory costs performance, so this function will reuse an
   existing allocation if a large enough one is idle.
   The memory lock is released, but not deallocated, when the kernel exits.  */

static void
configure_ephemeral_memories (struct kernel_info *kernel,
			      struct kernargs_abi *kernargs, int num_teams,
			      int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
  struct ephemeral_memories_list *item;

  /* Only OpenMP kernels use a team arena; OpenACC gets stack space only.  */
  int actual_arena_size = (kernel->kind == KIND_OPENMP
			   ? team_arena_size : 0);
  int actual_arena_total_size = actual_arena_size * num_teams;
  /* One combined allocation: arenas first, then per-wavefront stacks.  */
  size_t size = (actual_arena_total_size
		 + num_teams * num_threads * stack_size);

  /* Look for an idle allocation that is large enough; trylock ensures we
     never wait for one that is currently in use by a running kernel.  */
  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->size < size)
	continue;

      if (pthread_mutex_trylock (&item->in_use) == 0)
	break;
    }

  if (!item)
    {
      GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
		 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
		 num_teams, num_threads, size);

      /* The write lock serializes list extension; NEXT_PTR already points
	 at the tail's next field from the scan above.  */
      if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      item = malloc (sizeof (*item));
      item->size = size;
      item->next = NULL;
      *next_ptr = item;

      /* The new item is locked immediately so no other thread can claim it
	 before we hand it to this kernel launch.  */
      if (pthread_mutex_init (&item->in_use, NULL))
	{
	  GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
	  return;
	}
      if (pthread_mutex_lock (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}

      hsa_status_t status;
      status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
					       &item->address);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
      status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
						   HSA_ACCESS_PERMISSION_RW);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not assign arena & stack memory to device", status);
    }

  /* Publish the layout to the kernel via the kernargs ABI block: the arena
     (if any) occupies the start of the allocation, stacks follow it.  */
  kernargs->arena_ptr = (actual_arena_total_size
			 ? (uint64_t)item->address
			 : 0);
  kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
  kernargs->arena_size_per_team = actual_arena_size;
  kernargs->stack_size_per_thread = stack_size;
}
/* Mark an ephemeral memory space available for reuse.  */

static void
release_ephemeral_memories (struct agent_info* agent, void *address)
{
  struct ephemeral_memories_list *item;

  for (item = agent->ephemeral_memories_list; item; item = item->next)
    {
      if (item->address == address)
	{
	  /* Unlocking in_use makes this space claimable by the next launch;
	     the device memory itself remains allocated.  */
	  if (pthread_mutex_unlock (&item->in_use))
	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}
1838 /* Clean up all the allocated team arenas. */
1840 static bool
1841 destroy_ephemeral_memories (struct agent_info *agent)
1843 struct ephemeral_memories_list *item, *next;
1845 for (item = agent->ephemeral_memories_list; item; item = next)
1847 next = item->next;
1848 hsa_fns.hsa_memory_free_fn (item->address);
1849 if (pthread_mutex_destroy (&item->in_use))
1851 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1852 return false;
1854 free (item);
1856 agent->ephemeral_memories_list = NULL;
1858 return true;
/* Allocate memory on a specified device.  Returns NULL (after reporting an
   HSA error) on failure.  Emits an acc_ev_alloc profiling event when an
   OpenACC profiling callback is active on this thread.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  /* Only dispatch the profiling event if a profiling callback is live on
     this thread -- the common case is that there is none.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}
/* Create kernel dispatch data structure for given KERNEL, along with
   the necessary device signals and memory allocations.  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
			int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  /* The sync signal starts at 1 and is decremented by the device on kernel
     completion, which is what the dispatch waiter later waits for.  */
  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;

  /* GOMP_GCN_LOWLAT_POOL: -1 means auto-size the low-latency (LDS) pool;
     otherwise the user-requested size is clamped into a safe range.  */
  if (lowlat_size < 0)
    {
      /* Divide the LDS between the number of running teams.
	 Allocate not less than is defined in the kernel metadata.  */
      int teams_per_cu = num_teams / get_cu_count (agent);
      int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
      shadow->group_segment_size
	= (kernel->group_segment_size > LDS_per_team
	   ? kernel->group_segment_size
	   : LDS_per_team);;
    }
  else if (lowlat_size < GCN_LOWLAT_HEAP+8)
    /* Ensure that there's space for the OpenMP libgomp data.  */
    shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
  else
    /* 64KB is the hardware limit for the group segment.  */
    shadow->group_segment_size = (lowlat_size > 65536
				  ? 65536
				  : lowlat_size);

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;

  /* Create the ephemeral memory spaces.  */
  configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}
/* Forward a reverse-offload ("target device -> host") request to libgomp.
   All six values arrive as uint64_t because they are read from the device's
   output queue; DEV_NUM64 is narrowed to int for the libgomp entry point.  */

static void
process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
			 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
{
  int dev_num = dev_num64;
  GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
			  NULL);
}
/* Output any data written to console output from the kernel.  It is expected
   that this function is polled during kernel execution.

   We print all entries from the last item printed to the next entry without
   a "written" flag.  If the "final" flag is set then it'll continue right to
   the end.

   The print buffer is circular, but the from and to locations don't wrap when
   the buffer does, so the output limit is UINT_MAX.  The target blocks on
   output when the buffer is full.  */

static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
		bool final)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));

  /* Acquire pairs with the device's release of each queue entry.  */
  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
				       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
	printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      /* FROM/TO are monotonic counters; the modulo maps them onto the
	 circular queue.  */
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      /* An unwritten entry means the device is still composing it; stop
	 here unless this is the final drain.  */
      if (!data->written && !final)
	break;

      switch (data->type)
	{
	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
	case 4:
	  /* Type 4 entries carry a reverse-offload request, not text.  */
	  process_reverse_offload (data->value_u64[0], data->value_u64[1],
				   data->value_u64[2], data->value_u64[3],
				   data->value_u64[4], data->value_u64[5]);
	  break;
	default: printf ("GCN print buffer error!\n"); break;
	}
      data->written = 0;
      /* Release so the device sees the slot as free before it reuses it.  */
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
			__ATOMIC_RELEASE);
    }
  fflush (stdout);
}
/* Release data structure created for a kernel dispatch in SHADOW argument,
   and clean up the signal and memory allocations.  */

static void
release_kernel_dispatch (struct kernel_dispatch *shadow)
{
  GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);

  /* The ephemeral memory is keyed by its base address: the arena when one
     was configured, otherwise the stack (see configure_ephemeral_memories,
     which places the arena at the start of the combined allocation).  */
  struct kernargs *kernargs = shadow->kernarg_address;
  void *addr = (void *)kernargs->abi.arena_ptr;
  if (!addr)
    addr = (void *)kernargs->abi.stack_ptr;
  release_ephemeral_memories (shadow->agent, addr);

  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);

  hsa_signal_t s;
  s.handle = shadow->signal;
  hsa_fns.hsa_signal_destroy_fn (s);

  free (shadow);
}
/* Extract the properties from a kernel binary.  On failure the kernel is
   marked via initialization_failed rather than aborting, so loading can
   continue for other kernels.  */

static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  /* "<name>.kd": +4 covers ".kd" plus the NUL terminator.  */
  char *buf = alloca (strlen (kernel->name) + 4);
  sprintf (buf, "%s.kd", kernel->name);
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 buf, agent->id,
						 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", buf);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
	       status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
	     "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
	     (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
	     (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
	     (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  kernel->initialization_failed = true;
}
2145 /* Do all the work that is necessary before running KERNEL for the first time.
2146 The function assumes the program has been created, finalized and frozen by
2147 create_and_finalize_hsa_program. */
2149 static void
2150 init_kernel (struct kernel_info *kernel)
2152 if (pthread_mutex_lock (&kernel->init_mutex))
2153 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2154 if (kernel->initialized)
2156 if (pthread_mutex_unlock (&kernel->init_mutex))
2157 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2158 "mutex");
2160 return;
2163 init_kernel_properties (kernel);
2165 if (!kernel->initialization_failed)
2167 GCN_DEBUG ("\n");
2169 kernel->initialized = true;
2171 if (pthread_mutex_unlock (&kernel->init_mutex))
2172 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2173 "mutex");
/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.  */

static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  struct agent_info *agent = kernel->agent;
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      int max_vgprs = max_isa_vgprs (agent->device_isa);
      /* VGPRs are allocated in granules of 4 registers.  */
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (max_vgprs / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  /* Reserve a packet slot in the AQL queue.  */
  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.  */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  /* Clear the packet body but leave the 4-byte header/setup word alone;
     it is written last, via packet_store_release, to publish the packet.  */
  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x,
			      packet->grid_size_z);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = shadow->private_segment_size;
  packet->group_segment_size = shadow->group_segment_size;
  packet->kernel_object = shadow->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  /* The signal counts down to zero when the kernel completes.  */
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  /* Publish the packet (header written with release semantics) and ring
     the doorbell so the device processes it.  */
  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
				       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      /* Drain the device's print buffer while the kernel runs.  */
      console_output (kernel, shadow->kernarg_address, false);
    }
  /* Final drain, including any partially-written entries.  */
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  /* The top 16 bits of the return value are a magic marker written by the
     device: 0xcafe means normal termination, 0xffff means exit was called
     (low bits carry the signal number and the exit code).  */
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
2395 /* }}} */
2396 /* {{{ Load/Unload */
2398 /* Initialize KERNEL from D and other parameters. Return true on success. */
2400 static bool
2401 init_basic_kernel_info (struct kernel_info *kernel,
2402 struct hsa_kernel_description *d,
2403 struct agent_info *agent,
2404 struct module_info *module)
2406 kernel->agent = agent;
2407 kernel->module = module;
2408 kernel->name = d->name;
2409 kernel->description = d;
2410 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2412 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2413 return false;
2415 return true;
2418 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2420 static bool
2421 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2423 int isa_field = elf_gcn_isa_field (image);
2424 const char* isa_s = isa_hsa_name (isa_field);
2425 if (!isa_s)
2427 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2428 return false;
2431 if (isa_field != agent->device_isa)
2433 char msg[120];
2434 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2435 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2436 assert (agent_isa_s);
2437 assert (agent_isa_gcc_s);
2439 snprintf (msg, sizeof msg,
2440 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2441 "Try to recompile with '-foffload-options=-march=%s'.\n",
2442 isa_s, agent_isa_s, agent_isa_gcc_s);
2444 hsa_error (msg, HSA_STATUS_ERROR);
2445 return false;
2448 return true;
/* Create and finalize the program consisting of all loaded modules.

   Serialized by agent->prog_mutex; a second caller finds prog_finalized
   already set and returns immediately.  Note that the failure path also
   jumps to the "final" label and therefore marks the program finalized,
   so a failed finalization is not retried.  Returns TRUE on success.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      /* Refuse to load an image built for a different GPU.  */
      if (!isa_matches_agent (agent, image))
	goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      /* Allocate the device-side heap once per module.  */
      if (!module->heap)
	{
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void**)&module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
	    (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  /* Record the heap size in the heap's own size field so the
	     device-side allocator knows its bounds.  */
	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}
    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}
2556 /* Free the HSA program in agent and everything associated with it and set
2557 agent->prog_finalized and the initialized flags of all kernels to false.
2558 Return TRUE on success. */
2560 static bool
2561 destroy_hsa_program (struct agent_info *agent)
2563 if (!agent->prog_finalized)
2564 return true;
2566 hsa_status_t status;
2568 GCN_DEBUG ("Destroying the current GCN program.\n");
2570 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2571 if (status != HSA_STATUS_SUCCESS)
2572 return hsa_error ("Could not destroy GCN executable", status);
2574 if (agent->module)
2576 int i;
2577 for (i = 0; i < agent->module->kernel_count; i++)
2578 agent->module->kernels[i].initialized = false;
2580 if (agent->module->heap)
2582 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2583 agent->module->heap = NULL;
2586 agent->prog_finalized = false;
2587 return true;
2590 /* Deinitialize all information associated with MODULE and kernels within
2591 it. Return TRUE on success. */
2593 static bool
2594 destroy_module (struct module_info *module, bool locked)
2596 /* Run destructors before destroying module. */
2597 struct GOMP_kernel_launch_attributes kla =
2598 { 3,
2599 /* Grid size. */
2600 { 1, 64, 1 },
2601 /* Work-group size. */
2602 { 1, 64, 1 }
2605 if (module->fini_array_func)
2607 init_kernel (module->fini_array_func);
2608 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2610 module->constructors_run_p = false;
2612 int i;
2613 for (i = 0; i < module->kernel_count; i++)
2614 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2616 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2617 "mutex");
2618 return false;
2621 return true;
2624 /* }}} */
2625 /* {{{ Async */
/* Execute a single entry of async queue AQ, at position INDEX: run a
   queued kernel launch or callback, block on an async-wait placeholder,
   or mark a placeholder as reached.  (The previous comment here,
   "Callback of dispatch queues to report errors", described a different
   function.)  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
	struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	/* Block until another queue's thread executes the placeholder.  */
	pthread_mutex_lock (&placeholderp->mutex);

	while (!placeholderp->executed)
	  pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      /* Flag that this point in the queue has been reached, and wake any
	 ASYNC_WAIT entry blocked on it.  */
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}
/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      /* Debug mode: entries are executed by the issuing thread instead
	 (see drain_queue_synchronous); report "stopped" and exit.  */
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  /* Drop the queue lock while the entry runs so that producers
	     can continue pushing entries; the head is only advanced
	     afterwards, under the lock again.  */
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  /* Wake anyone blocked in wait_for_queue_nonfull/wait_queue.  */
	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  /* Queue empty: sleep until a producer signals cond_in.  */
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  /* Acknowledge the stop request (finalize_async_thread waits for 2).  */
  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}
2764 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2765 is not usually the case. This is just a debug tool. */
2767 static void
2768 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2770 pthread_mutex_lock (&aq->mutex);
2772 while (aq->queue_n > 0)
2774 execute_queue_entry (aq, aq->queue_first);
2776 aq->queue_first = ((aq->queue_first + 1)
2777 % ASYNC_QUEUE_SIZE);
2778 aq->queue_n--;
2781 pthread_mutex_unlock (&aq->mutex);
2784 /* Block the current thread until an async queue is writable. The aq->mutex
2785 lock should be held on entry, and remains locked on exit. */
2787 static void
2788 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2790 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2792 /* Queue is full. Wait for it to not be full. */
2793 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2794 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2798 /* Request an asynchronous kernel launch on the specified queue. This
2799 may block if the queue is full, but returns without waiting for the
2800 kernel to run. */
2802 static void
2803 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2804 void *vars, struct GOMP_kernel_launch_attributes *kla)
2806 assert (aq->agent == kernel->agent);
2808 pthread_mutex_lock (&aq->mutex);
2810 wait_for_queue_nonfull (aq);
2812 int queue_last = ((aq->queue_first + aq->queue_n)
2813 % ASYNC_QUEUE_SIZE);
2814 if (DEBUG_QUEUES)
2815 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2816 aq->id, queue_last);
2818 aq->queue[queue_last].type = KERNEL_LAUNCH;
2819 aq->queue[queue_last].u.launch.kernel = kernel;
2820 aq->queue[queue_last].u.launch.vars = vars;
2821 aq->queue[queue_last].u.launch.kla = *kla;
2823 aq->queue_n++;
2825 if (DEBUG_THREAD_SIGNAL)
2826 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2827 aq->agent->device_id, aq->id);
2828 pthread_cond_signal (&aq->queue_cond_in);
2830 pthread_mutex_unlock (&aq->mutex);
2833 /* Request an asynchronous callback on the specified queue. The callback
2834 function will be called, with the given opaque data, from the appropriate
2835 async thread, when all previous items on that queue are complete. */
2837 static void
2838 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2839 void *data)
2841 pthread_mutex_lock (&aq->mutex);
2843 wait_for_queue_nonfull (aq);
2845 int queue_last = ((aq->queue_first + aq->queue_n)
2846 % ASYNC_QUEUE_SIZE);
2847 if (DEBUG_QUEUES)
2848 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2849 aq->id, queue_last);
2851 aq->queue[queue_last].type = CALLBACK;
2852 aq->queue[queue_last].u.callback.fn = fn;
2853 aq->queue[queue_last].u.callback.data = data;
2855 aq->queue_n++;
2857 if (DEBUG_THREAD_SIGNAL)
2858 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2859 aq->agent->device_id, aq->id);
2860 pthread_cond_signal (&aq->queue_cond_in);
2862 pthread_mutex_unlock (&aq->mutex);
2865 /* Request that a given async thread wait for another thread (unspecified) to
2866 reach the given placeholder. The wait will occur when all previous entries
2867 on the queue are complete. A placeholder is effectively a kind of signal
2868 which simply sets a flag when encountered in a queue. */
2870 static void
2871 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2872 struct placeholder *placeholderp)
2874 pthread_mutex_lock (&aq->mutex);
2876 wait_for_queue_nonfull (aq);
2878 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2879 if (DEBUG_QUEUES)
2880 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2881 aq->id, queue_last);
2883 aq->queue[queue_last].type = ASYNC_WAIT;
2884 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2886 aq->queue_n++;
2888 if (DEBUG_THREAD_SIGNAL)
2889 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2890 aq->agent->device_id, aq->id);
2891 pthread_cond_signal (&aq->queue_cond_in);
2893 pthread_mutex_unlock (&aq->mutex);
/* Add a placeholder into an async queue.  When the async thread reaches the
   placeholder it will set the "executed" flag to true and continue.
   Another thread may be waiting on this thread reaching the placeholder.
   Returns a pointer to the placeholder, which lives inside the queue slot
   itself.  */

static struct placeholder *
queue_push_placeholder (struct goacc_asyncqueue *aq)
{
  struct placeholder *placeholderp;

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
  placeholderp = &aq->queue[queue_last].u.placeholder;

  /* NOTE(review): on pthread_*_init failure, GOMP_PLUGIN_error is not
     fatal and execution falls through with aq->mutex already unlocked,
     so the final unlock below would be a double unlock.  These init
     calls are evidently assumed never to fail in practice.  */
  if (pthread_mutex_init (&placeholderp->mutex, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
    }

  if (pthread_cond_init (&placeholderp->cond, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
    }

  placeholderp->executed = 0;

  aq->queue_n++;

  /* Wake the drain thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);

  return placeholderp;
}
/* Signal an asynchronous thread to terminate, and wait for it to do so.

   Protocol: drain_queue_stop is 0 while running; this function sets it
   to 1 to request a stop, and drain_queue sets it to 2 just before
   exiting.  A value of 2 on entry means the thread is already gone.  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  if (aq->drain_queue_stop == 2)
    {
      /* Thread already stopped (e.g. synchronous-drain debug mode).  */
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  aq->drain_queue_stop = 1;

  /* Wake the drain thread so it notices the stop request.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  /* Wait for the acknowledgement (drain_queue_stop == 2), which the
     drain thread broadcasts on queue_cond_out.  */
  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}
2984 /* Set up an async queue for OpenMP. There will be only one. The
2985 implementation simply uses an OpenACC async queue.
2986 FIXME: is this thread-safe if two threads call this function? */
2988 static void
2989 maybe_init_omp_async (struct agent_info *agent)
2991 if (!agent->omp_async_queue)
2992 agent->omp_async_queue
2993 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2996 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2997 copies from read-only pages. */
2999 static void
3000 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3002 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3004 if (status == HSA_STATUS_SUCCESS)
3005 return;
3007 /* It appears that the copy fails if the source data is in a read-only page.
3008 We can't detect that easily, so try copying the data to a temporary buffer
3009 and doing the copy again if we got an error above. */
3011 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3012 "[%p:+%d]\n", (void *) src, (int) len);
3014 void *src_copy = malloc (len);
3015 memcpy (src_copy, src, len);
3016 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3017 free (src_copy);
3018 if (status != HSA_STATUS_SUCCESS)
3019 GOMP_PLUGIN_error ("memory copy failed");
3022 /* Copy data to or from a device. This is intended for use as an async
3023 callback event. */
3025 static void
3026 copy_data (void *data_)
3028 struct copy_data *data = (struct copy_data *)data_;
3029 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3030 data->aq->agent->device_id, data->aq->id, data->len, data->src,
3031 data->dst);
3032 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3033 free (data);
3036 /* Request an asynchronous data copy, to or from a device, on a given queue.
3037 The event will be registered as a callback. */
3039 static void
3040 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3041 size_t len)
3043 if (DEBUG_QUEUES)
3044 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3045 aq->agent->device_id, aq->id, len, src, dst);
3046 struct copy_data *data
3047 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3048 data->dst = dst;
3049 data->src = src;
3050 data->len = len;
3051 data->aq = aq;
3052 queue_push_callback (aq, copy_data, data);
3055 /* Return true if the given queue is currently empty. */
3057 static int
3058 queue_empty (struct goacc_asyncqueue *aq)
3060 pthread_mutex_lock (&aq->mutex);
3061 int res = aq->queue_n == 0 ? 1 : 0;
3062 pthread_mutex_unlock (&aq->mutex);
3064 return res;
/* Wait for a given queue to become empty.  This implements an OpenACC wait
   directive.  */

static void
wait_queue (struct goacc_asyncqueue *aq)
{
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      /* Debug mode: drain the queue on the calling thread instead of
	 waiting for the background thread.  */
      drain_queue_synchronous (aq);
      return;
    }

  pthread_mutex_lock (&aq->mutex);

  /* Sleep until the drain thread broadcasts queue_cond_out after retiring
     entries; re-check the count after every wakeup.  */
  while (aq->queue_n > 0)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
		   aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
		   aq->id);
    }

  pthread_mutex_unlock (&aq->mutex);
  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
}
3096 /* }}} */
3097 /* {{{ OpenACC support */
3099 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3101 static void
3102 gcn_exec (struct kernel_info *kernel,
3103 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3104 struct goacc_asyncqueue *aq)
3106 if (!GOMP_OFFLOAD_can_run (kernel))
3107 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3109 /* If we get here then this must be an OpenACC kernel. */
3110 kernel->kind = KIND_OPENACC;
3112 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3113 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3115 struct hsa_kernel_description *d
3116 = &kernel->module->image_desc->kernel_infos[i];
3117 if (d->name == kernel->name)
3119 hsa_kernel_desc = d;
3120 break;
3124 /* We may have statically-determined dimensions in
3125 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3126 invocation at runtime in dims[]. We allow static dimensions to take
3127 priority over dynamic dimensions when present (non-zero). */
3128 if (hsa_kernel_desc->oacc_dims[0] > 0)
3129 dims[0] = hsa_kernel_desc->oacc_dims[0];
3130 if (hsa_kernel_desc->oacc_dims[1] > 0)
3131 dims[1] = hsa_kernel_desc->oacc_dims[1];
3132 if (hsa_kernel_desc->oacc_dims[2] > 0)
3133 dims[2] = hsa_kernel_desc->oacc_dims[2];
3135 /* Ideally, when a dimension isn't explicitly specified, we should
3136 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3137 In practice, we tune for peak performance on BabelStream, which
3138 for OpenACC is currently 32 threads per CU. */
3139 if (dims[0] == 0 && dims[1] == 0)
3141 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3142 number. There isn't really a correct answer for this without a clue
3143 about the problem size, so let's do a reasonable number of workers
3144 and gangs. */
3146 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3147 dims[1] = 8; /* Workers. */
3149 else if (dims[0] == 0 && dims[1] > 0)
3151 /* Auto-scale the number of gangs with the requested number of workers. */
3152 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3154 else if (dims[0] > 0 && dims[1] == 0)
3156 /* Auto-scale the number of workers with the requested number of gangs. */
3157 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3158 if (dims[1] == 0)
3159 dims[1] = 1;
3160 if (dims[1] > 16)
3161 dims[1] = 16;
3164 /* The incoming dimensions are expressed in terms of gangs, workers, and
3165 vectors. The HSA dimensions are expressed in terms of "work-items",
3166 which means multiples of vector lanes.
3168 The "grid size" specifies the size of the problem space, and the
3169 "work-group size" specifies how much of that we want a single compute
3170 unit to chew on at once.
3172 The three dimensions do not really correspond to hardware, but the
3173 important thing is that the HSA runtime will launch as many
3174 work-groups as it takes to process the entire grid, and each
3175 work-group will contain as many wave-fronts as it takes to process
3176 the work-items in that group.
3178 Essentially, as long as we set the Y dimension to 64 (the number of
3179 vector lanes in hardware), and the Z group size to the maximum (16),
3180 then we will get the gangs (X) and workers (Z) launched as we expect.
3182 The reason for the apparent reversal of vector and worker dimension
3183 order is to do with the way the run-time distributes work-items across
3184 v1 and v2. */
3185 struct GOMP_kernel_launch_attributes kla =
3187 /* Grid size. */
3188 {dims[0], 64, dims[1]},
3189 /* Work-group size. */
3190 {1, 64, 16}
3193 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3194 acc_prof_info *prof_info = thr->prof_info;
3195 acc_event_info enqueue_launch_event_info;
3196 acc_api_info *api_info = thr->api_info;
3197 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3198 if (profiling_dispatch_p)
3200 prof_info->event_type = acc_ev_enqueue_launch_start;
3202 enqueue_launch_event_info.launch_event.event_type
3203 = prof_info->event_type;
3204 enqueue_launch_event_info.launch_event.valid_bytes
3205 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3206 enqueue_launch_event_info.launch_event.parent_construct
3207 = acc_construct_parallel;
3208 enqueue_launch_event_info.launch_event.implicit = 1;
3209 enqueue_launch_event_info.launch_event.tool_info = NULL;
3210 enqueue_launch_event_info.launch_event.kernel_name
3211 = (char *) kernel->name;
3212 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3213 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3214 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3216 api_info->device_api = acc_device_api_other;
3218 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3219 &enqueue_launch_event_info, api_info);
3222 if (!async)
3223 run_kernel (kernel, devaddrs, &kla, NULL, false);
3224 else
3225 queue_push_launch (aq, kernel, devaddrs, &kla);
3227 if (profiling_dispatch_p)
3229 prof_info->event_type = acc_ev_enqueue_launch_end;
3230 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3231 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3232 &enqueue_launch_event_info,
3233 api_info);
3237 /* }}} */
3238 /* {{{ Generic Plugin API */
/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  /* The name is a fixed identifier used by libgomp to match plugins to
     offload targets; it is never freed by the caller.  */
  static const char plugin_name[] = "gcn";
  return plugin_name;
}
3248 /* Return the specific capabilities the HSA accelerator have. */
3250 unsigned int
3251 GOMP_OFFLOAD_get_caps (void)
3253 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3254 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3255 | GOMP_OFFLOAD_CAP_OPENACC_200;
3258 /* Identify as GCN accelerator. */
3261 GOMP_OFFLOAD_get_type (void)
3263 return OFFLOAD_TARGET_TYPE_GCN;
3266 /* Return the libgomp version number we're compatible with. There is
3267 no requirement for cross-version compatibility. */
3269 unsigned
3270 GOMP_OFFLOAD_version (void)
3272 return GOMP_VERSION;
3275 /* Return the number of GCN devices on the system. */
3278 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3280 if (!init_hsa_context ())
3281 return 0;
3282 /* Return -1 if no omp_requires_mask cannot be fulfilled but
3283 devices were present. */
3284 if (hsa_context.agent_count > 0
3285 && ((omp_requires_mask
3286 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3287 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3288 return -1;
3289 return hsa_context.agent_count;
3292 /* Initialize device (agent) number N so that it can be used for computation.
3293 Return TRUE on success. */
3295 bool
3296 GOMP_OFFLOAD_init_device (int n)
/* Lazily create the process-wide HSA context before touching any agent.  */
3298 if (!init_hsa_context ())
3299 return false;
3300 if (n >= hsa_context.agent_count)
3302 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3303 return false;
3305 struct agent_info *agent = &hsa_context.agents[n];
/* Initialization is idempotent: a second call is a cheap no-op.  */
3307 if (agent->initialized)
3308 return true;
3310 agent->device_id = n;
/* Create the agent's locks; any failure aborts initialization.  */
3312 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3314 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3315 return false;
3317 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3319 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3320 return false;
3322 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3324 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3325 return false;
3327 if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
3329 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3330 return false;
3332 agent->async_queues = NULL;
3333 agent->omp_async_queue = NULL;
3334 agent->ephemeral_memories_list = NULL;
/* Query agent properties (max queue size, name, vendor) via the HSA
   runtime; each failure is reported through hsa_error, which returns
   false.  */
3336 uint32_t queue_size;
3337 hsa_status_t status;
3338 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3339 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3340 &queue_size);
3341 if (status != HSA_STATUS_SUCCESS)
3342 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3343 status);
3345 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3346 &agent->name);
3347 if (status != HSA_STATUS_SUCCESS)
3348 return hsa_error ("Error querying the name of the agent", status);
/* Map the textual agent name to a numeric ISA code; negative means the
   device architecture is not one this plugin supports.  */
3350 agent->device_isa = isa_code (agent->name);
3351 if (agent->device_isa < 0)
3352 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3354 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3355 &agent->vendor_name);
3356 if (status != HSA_STATUS_SUCCESS)
3357 return hsa_error ("Error querying the vendor name of the agent", status);
/* Create the synchronous command queue used for non-async kernel launches
   and the like.  */
3359 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3360 HSA_QUEUE_TYPE_MULTI,
3361 hsa_queue_callback, NULL, UINT32_MAX,
3362 UINT32_MAX, &agent->sync_queue);
3363 if (status != HSA_STATUS_SUCCESS)
3364 return hsa_error ("Error creating command queue", status);
/* Locate a memory region suitable for kernel arguments.  The handle is
   pre-set to -1 so we can tell whether the iteration callback found one;
   HSA_STATUS_INFO_BREAK just means the callback stopped early.  */
3366 agent->kernarg_region.handle = (uint64_t) -1;
3367 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3368 get_kernarg_memory_region,
3369 &agent->kernarg_region);
3370 if (status != HSA_STATUS_SUCCESS
3371 && status != HSA_STATUS_INFO_BREAK)
3372 hsa_error ("Scanning memory regions failed", status);
3373 if (agent->kernarg_region.handle == (uint64_t) -1)
3375 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3376 "arguments");
3377 return false;
3379 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3380 dump_hsa_region (agent->kernarg_region, NULL);
/* Likewise find a region for general device data allocations.  */
3382 agent->data_region.handle = (uint64_t) -1;
3383 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3384 get_data_memory_region,
3385 &agent->data_region);
3386 if (status != HSA_STATUS_SUCCESS
3387 && status != HSA_STATUS_INFO_BREAK)
3388 hsa_error ("Scanning memory regions failed", status);
3389 if (agent->data_region.handle == (uint64_t) -1)
3391 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3392 "data");
3393 return false;
3395 GCN_DEBUG ("Selected device data memory region:\n");
3396 dump_hsa_region (agent->data_region, NULL);
3398 GCN_DEBUG ("GCN agent %d initialized\n", n);
3400 agent->initialized = true;
3401 return true;
3404 /* Load GCN object-code module described by struct gcn_image_desc in
3405 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3406 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3407 contain the on-device addresses of the functions for reverse offload. To be
3408 freed by the caller. */
/* Returns the number of entries written to *TARGET_TABLE
   (kernel_count + var_count + other_count), or -1 on failure.  */
3411 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3412 struct addr_pair **target_table,
3413 uint64_t **rev_fn_table,
3414 uint64_t *host_ind_fn_table)
/* Reject images built by a mismatching compiler version.  */
3416 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3418 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3419 " (expected %u, received %u)",
3420 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version))
3421 return -1;
3424 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3425 struct agent_info *agent;
3426 struct addr_pair *pair;
3427 struct module_info *module;
3428 struct kernel_info *kernel;
3429 int kernel_count = image_desc->kernel_count;
/* Indirect functions are only present in images new enough to know
   about them.  */
3430 unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3431 ? image_desc->ind_func_count : 0;
3432 unsigned var_count = image_desc->global_variable_count;
3433 /* Currently, "others" is a struct of ICVS. */
3434 int other_count = 1;
3436 agent = get_agent_info (ord);
3437 if (!agent)
3438 return -1;
3440 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3442 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3443 return -1;
/* Re-loading an image requires tearing down the previously finalized
   HSA program first.  */
3445 if (agent->prog_finalized
3446 && !destroy_hsa_program (agent))
3447 return -1;
3449 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3450 GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
3451 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3452 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
/* "- 2" accounts for the _init_array/_fini_array pseudo-kernels, which
   are not reported back to libgomp (see the loop below).  */
3453 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3454 * sizeof (struct addr_pair));
3455 *target_table = pair;
3456 module = (struct module_info *)
3457 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3458 + kernel_count * sizeof (struct kernel_info));
3459 module->image_desc = image_desc;
3460 module->kernel_count = kernel_count;
3461 module->heap = NULL;
3462 module->constructors_run_p = false;
3464 kernel = &module->kernels[0];
3466 /* Allocate memory for kernel dependencies. */
3467 for (unsigned i = 0; i < kernel_count; i++)
3469 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3470 if (!init_basic_kernel_info (kernel, d, agent, module))
3471 return -1;
/* The constructor/destructor pseudo-kernels are remembered on the
   module but not exposed in the address table.  */
3472 if (strcmp (d->name, "_init_array") == 0)
3473 module->init_array_func = kernel;
3474 else if (strcmp (d->name, "_fini_array") == 0)
3475 module->fini_array_func = kernel;
3476 else
3478 pair->start = (uintptr_t) kernel;
3479 pair->end = (uintptr_t) (kernel + 1);
3480 pair++;
3482 kernel++;
3485 agent->module = module;
3486 if (pthread_rwlock_unlock (&agent->module_rwlock))
3488 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3489 return -1;
3492 if (!create_and_finalize_hsa_program (agent))
3493 return -1;
/* Read the variable table (.offload_var_table) from device memory and
   report each variable's address range back to libgomp.  */
3495 if (var_count > 0)
3497 hsa_status_t status;
3498 hsa_executable_symbol_t var_symbol;
3499 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3500 ".offload_var_table",
3501 agent->id,
3502 0, &var_symbol);
3504 if (status != HSA_STATUS_SUCCESS)
3505 hsa_fatal ("Could not find symbol for variable in the code object",
3506 status);
3508 uint64_t var_table_addr;
3509 status = hsa_fns.hsa_executable_symbol_get_info_fn
3510 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3511 &var_table_addr);
3512 if (status != HSA_STATUS_SUCCESS)
3513 hsa_fatal ("Could not extract a variable from its symbol", status);
3515 struct {
3516 uint64_t addr;
3517 uint64_t size;
3518 } var_table[var_count];
3519 GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3520 (void*)var_table_addr, sizeof (var_table));
3522 for (unsigned i = 0; i < var_count; i++)
3524 pair->start = var_table[i].addr;
3525 pair->end = var_table[i].addr + var_table[i].size;
3526 GCN_DEBUG ("Found variable at %p with size %lu\n",
3527 (void *)var_table[i].addr, var_table[i].size);
3528 pair++;
/* Build the host->device address map for indirect function calls and
   publish its device address through GOMP_INDIRECT_ADDR_MAP.  */
3532 if (ind_func_count > 0)
3534 hsa_status_t status;
3536 /* Read indirect function table from image. */
3537 hsa_executable_symbol_t ind_funcs_symbol;
3538 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3539 ".offload_ind_func_table",
3540 agent->id,
3541 0, &ind_funcs_symbol);
3543 if (status != HSA_STATUS_SUCCESS)
3544 hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3545 "code object", status);
3547 uint64_t ind_funcs_table_addr;
3548 status = hsa_fns.hsa_executable_symbol_get_info_fn
3549 (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3550 &ind_funcs_table_addr);
3551 if (status != HSA_STATUS_SUCCESS)
3552 hsa_fatal ("Could not extract a variable from its symbol", status);
3554 uint64_t ind_funcs_table[ind_func_count];
3555 GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
3556 (void*) ind_funcs_table_addr,
3557 sizeof (ind_funcs_table));
3559 /* Build host->target address map for indirect functions. */
/* Flat array of (host, device) pairs, terminated by a 0 entry.  */
3560 uint64_t ind_fn_map[ind_func_count * 2 + 1];
3561 for (unsigned i = 0; i < ind_func_count; i++)
3563 ind_fn_map[i * 2] = host_ind_fn_table[i];
3564 ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
3565 GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3566 i, host_ind_fn_table[i], ind_funcs_table[i]);
3568 ind_fn_map[ind_func_count * 2] = 0;
3570 /* Write the map onto the target. */
3571 void *map_target_addr
3572 = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
3573 GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
3575 GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
3576 (void*) ind_fn_map,
3577 sizeof (ind_fn_map));
3579 /* Write address of the map onto the target. */
3580 hsa_executable_symbol_t symbol;
3582 status
3583 = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3584 XSTRING (GOMP_INDIRECT_ADDR_MAP),
3585 agent->id, 0, &symbol);
3586 if (status != HSA_STATUS_SUCCESS)
3587 hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3588 status);
3590 uint64_t varptr;
3591 uint32_t varsize;
3593 status = hsa_fns.hsa_executable_symbol_get_info_fn
3594 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3595 &varptr);
3596 if (status != HSA_STATUS_SUCCESS)
3597 hsa_fatal ("Could not extract a variable from its symbol", status);
3598 status = hsa_fns.hsa_executable_symbol_get_info_fn
3599 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3600 &varsize);
3601 if (status != HSA_STATUS_SUCCESS)
3602 hsa_fatal ("Could not extract a variable size from its symbol",
3603 status);
3605 GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3606 varptr, varsize);
3608 GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
3609 &map_target_addr,
3610 sizeof (map_target_addr));
/* The "other" entry: the additional-ICVs struct, if this image has it.
   An absent symbol is not an error; a 0/0 pair is reported instead.  */
3613 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3615 hsa_status_t status;
3616 hsa_executable_symbol_t var_symbol;
3617 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3618 XSTRING (GOMP_ADDITIONAL_ICVS),
3619 agent->id, 0, &var_symbol);
3620 if (status == HSA_STATUS_SUCCESS)
3622 uint64_t varptr;
3623 uint32_t varsize;
3625 status = hsa_fns.hsa_executable_symbol_get_info_fn
3626 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3627 &varptr);
3628 if (status != HSA_STATUS_SUCCESS)
3629 hsa_fatal ("Could not extract a variable from its symbol", status);
3630 status = hsa_fns.hsa_executable_symbol_get_info_fn
3631 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3632 &varsize);
3633 if (status != HSA_STATUS_SUCCESS)
3634 hsa_fatal ("Could not extract a variable size from its symbol",
3635 status);
3637 pair->start = varptr;
3638 pair->end = varptr + varsize;
3640 else
3642 /* The variable was not in this image. */
3643 GCN_DEBUG ("Variable not found in image: %s\n",
3644 XSTRING (GOMP_ADDITIONAL_ICVS));
3645 pair->start = pair->end = 0;
3648 /* Ensure that constructors are run first. */
3649 struct GOMP_kernel_launch_attributes kla =
3650 { 3,
3651 /* Grid size. */
3652 { 1, 64, 1 },
3653 /* Work-group size. */
3654 { 1, 64, 1 }
3657 if (module->init_array_func)
3659 init_kernel (module->init_array_func);
3660 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3662 module->constructors_run_p = true;
3664 /* Don't report kernels that libgomp need not know about. */
3665 if (module->init_array_func)
3666 kernel_count--;
3667 if (module->fini_array_func)
3668 kernel_count--;
/* Optionally read .offload_func_table so libgomp can translate device
   function addresses back to host ones for reverse offload.  */
3670 if (rev_fn_table != NULL && kernel_count == 0)
3671 *rev_fn_table = NULL;
3672 else if (rev_fn_table != NULL)
3674 hsa_status_t status;
3675 hsa_executable_symbol_t var_symbol;
3676 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3677 ".offload_func_table",
3678 agent->id, 0, &var_symbol);
3679 if (status != HSA_STATUS_SUCCESS)
3680 hsa_fatal ("Could not find symbol for variable in the code object",
3681 status);
3682 uint64_t fn_table_addr;
3683 status = hsa_fns.hsa_executable_symbol_get_info_fn
3684 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3685 &fn_table_addr);
3686 if (status != HSA_STATUS_SUCCESS)
3687 hsa_fatal ("Could not extract a variable from its symbol", status);
3688 *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3689 GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3690 (void*) fn_table_addr,
3691 kernel_count * sizeof (uint64_t));
3694 return kernel_count + var_count + other_count;
3697 /* Unload GCN object-code module described by struct gcn_image_desc in
3698 TARGET_DATA from agent number N. Return TRUE on success. */
3700 bool
3701 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3703 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3705 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3706 " (expected %u, received %u)",
3707 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3708 return false;
3711 struct agent_info *agent;
3712 agent = get_agent_info (n);
3713 if (!agent)
3714 return false;
3716 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3718 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3719 return false;
3722 if (!agent->module || agent->module->image_desc != target_data)
3724 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3725 "loaded before");
3726 return false;
3729 if (!destroy_module (agent->module, true))
3730 return false;
3731 free (agent->module);
3732 agent->module = NULL;
3733 if (!destroy_hsa_program (agent))
3734 return false;
3735 if (pthread_rwlock_unlock (&agent->module_rwlock))
3737 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3738 return false;
3740 return true;
3743 /* Deinitialize all information and status associated with agent number N. We
3744 do not attempt any synchronization, assuming the user and libgomp will not
3745 attempt deinitialization of a device that is in any way being used at the
3746 same time. Return TRUE on success. */
3748 bool
3749 GOMP_OFFLOAD_fini_device (int n)
3751 struct agent_info *agent = get_agent_info (n);
3752 if (!agent)
3753 return false;
/* Nothing to do for a device that was never (or already un-) initialized.  */
3755 if (!agent->initialized)
3756 return true;
/* Tear down the OpenMP async queue first, while the HSA queue machinery
   is still alive.  */
3758 if (agent->omp_async_queue)
3760 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3761 agent->omp_async_queue = NULL;
3764 if (agent->module)
3766 if (!destroy_module (agent->module, false))
3767 return false;
3768 free (agent->module);
3769 agent->module = NULL;
/* Release device-side scratch memories and the finalized HSA program
   before destroying the command queue itself.  */
3772 if (!destroy_ephemeral_memories (agent))
3773 return false;
3775 if (!destroy_hsa_program (agent))
3776 return false;
3778 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3779 if (status != HSA_STATUS_SUCCESS)
3780 return hsa_error ("Error destroying command queue", status);
/* Finally destroy the agent's locks, mirroring init_device.  */
3782 if (pthread_mutex_destroy (&agent->prog_mutex))
3784 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3785 return false;
3787 if (pthread_rwlock_destroy (&agent->module_rwlock))
3789 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3790 return false;
3793 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3795 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3796 return false;
3798 if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
3800 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3801 return false;
3803 agent->initialized = false;
3804 return true;
3807 /* Return true if the HSA runtime can run function FN_PTR. */
3809 bool
3810 GOMP_OFFLOAD_can_run (void *fn_ptr)
3812 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3814 init_kernel (kernel);
3815 if (kernel->initialization_failed)
3816 goto failure;
3818 return true;
3820 failure:
3821 if (suppress_host_fallback)
3822 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3823 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3824 return false;
/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  /* All the real work is done by the per-agent allocator.  */
  return alloc_by_agent (get_agent_info (n), size);
}
3836 /* Free memory from device N. */
3838 bool
3839 GOMP_OFFLOAD_free (int device, void *ptr)
3841 GCN_DEBUG ("Freeing memory on device %d\n", device);
3843 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3844 if (status != HSA_STATUS_SUCCESS)
3846 hsa_error ("Could not free device memory", status);
3847 return false;
3850 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3851 bool profiling_dispatch_p
3852 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3853 if (profiling_dispatch_p)
3855 acc_prof_info *prof_info = thr->prof_info;
3856 acc_event_info data_event_info;
3857 acc_api_info *api_info = thr->api_info;
3859 prof_info->event_type = acc_ev_free;
3861 data_event_info.data_event.event_type = prof_info->event_type;
3862 data_event_info.data_event.valid_bytes
3863 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3864 data_event_info.data_event.parent_construct
3865 = acc_construct_parallel;
3866 data_event_info.data_event.implicit = 1;
3867 data_event_info.data_event.tool_info = NULL;
3868 data_event_info.data_event.var_name = NULL;
3869 data_event_info.data_event.bytes = 0;
3870 data_event_info.data_event.host_ptr = NULL;
3871 data_event_info.data_event.device_ptr = (void *) ptr;
3873 api_info->device_api = acc_device_api_other;
3875 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3876 api_info);
3879 return true;
3882 /* Copy data from DEVICE to host. */
3884 bool
3885 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3887 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3888 src, dst);
3889 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3890 if (status != HSA_STATUS_SUCCESS)
3891 GOMP_PLUGIN_error ("memory copy failed");
3892 return true;
/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  /* Delegate to the wrapper around hsa_memory_copy.  */
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}
3906 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3908 bool
3909 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3911 struct gcn_thread *thread_data = gcn_thread ();
3913 if (thread_data && !async_synchronous_p (thread_data->async))
3915 struct agent_info *agent = get_agent_info (device);
3916 maybe_init_omp_async (agent);
3917 queue_push_copy (agent->omp_async_queue, dst, src, n);
3918 return true;
3921 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3922 device, src, device, dst);
3923 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3924 if (status != HSA_STATUS_SUCCESS)
3925 GOMP_PLUGIN_error ("memory copy failed");
3926 return true;
3929 /* }}} */
3930 /* {{{ OpenMP Plugin API */
3932 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3933 in VARS as a parameter. The kernel is identified by FN_PTR which must point
3934 to a kernel_info structure, and must have previously been loaded to the
3935 specified device. */
3937 void
3938 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3940 struct agent_info *agent = get_agent_info (device);
3941 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3942 struct GOMP_kernel_launch_attributes def;
3943 struct GOMP_kernel_launch_attributes *kla;
3944 assert (agent == kernel->agent);
3946 /* If we get here then the kernel must be OpenMP. */
3947 kernel->kind = KIND_OPENMP;
3949 if (!parse_target_attributes (args, &def, &kla, agent))
3951 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3952 return;
3954 run_kernel (kernel, vars, kla, NULL, false);
3957 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
3958 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3959 GOMP_PLUGIN_target_task_completion when it has finished. */
3961 void
3962 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3963 void **args, void *async_data)
3965 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3966 struct agent_info *agent = get_agent_info (device);
3967 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3968 struct GOMP_kernel_launch_attributes def;
3969 struct GOMP_kernel_launch_attributes *kla;
3970 assert (agent == kernel->agent);
3972 /* If we get here then the kernel must be OpenMP. */
3973 kernel->kind = KIND_OPENMP;
3975 if (!parse_target_attributes (args, &def, &kla, agent))
3977 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3978 return;
3981 maybe_init_omp_async (agent);
3982 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3983 queue_push_callback (agent->omp_async_queue,
3984 GOMP_PLUGIN_target_task_completion, async_data);
3987 /* }}} */
3988 /* {{{ OpenACC Plugin API */
3990 /* Run a synchronous OpenACC kernel. The device number is inferred from the
3991 already-loaded KERNEL. */
3993 void
3994 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
3995 size_t mapnum __attribute__((unused)),
3996 void **hostaddrs __attribute__((unused)),
3997 void **devaddrs, unsigned *dims,
3998 void *targ_mem_desc)
4000 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4002 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
4005 /* Run an asynchronous OpenACC kernel on the specified queue. */
4007 void
4008 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
4009 size_t mapnum __attribute__((unused)),
4010 void **hostaddrs __attribute__((unused)),
4011 void **devaddrs,
4012 unsigned *dims, void *targ_mem_desc,
4013 struct goacc_asyncqueue *aq)
4015 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4017 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
4020 /* Create a new asynchronous thread and queue for running future kernels. */
4022 struct goacc_asyncqueue *
4023 GOMP_OFFLOAD_openacc_async_construct (int device)
4025 struct agent_info *agent = get_agent_info (device);
4027 pthread_mutex_lock (&agent->async_queues_mutex);
4029 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
4030 aq->agent = get_agent_info (device);
4031 aq->prev = NULL;
4032 aq->next = agent->async_queues;
4033 if (aq->next)
4035 aq->next->prev = aq;
4036 aq->id = aq->next->id + 1;
4038 else
4039 aq->id = 1;
4040 agent->async_queues = aq;
4042 aq->queue_first = 0;
4043 aq->queue_n = 0;
4044 aq->drain_queue_stop = 0;
4046 if (pthread_mutex_init (&aq->mutex, NULL))
4048 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
4049 return false;
4051 if (pthread_cond_init (&aq->queue_cond_in, NULL))
4053 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4054 return false;
4056 if (pthread_cond_init (&aq->queue_cond_out, NULL))
4058 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4059 return false;
4062 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4063 ASYNC_QUEUE_SIZE,
4064 HSA_QUEUE_TYPE_MULTI,
4065 hsa_queue_callback, NULL,
4066 UINT32_MAX, UINT32_MAX,
4067 &aq->hsa_queue);
4068 if (status != HSA_STATUS_SUCCESS)
4069 hsa_fatal ("Error creating command queue", status);
4071 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4072 if (err != 0)
4073 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4074 strerror (err));
4075 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4076 aq->id);
4078 pthread_mutex_unlock (&agent->async_queues_mutex);
4080 return aq;
4083 /* Destroy an existing asynchronous thread and queue. Waits for any
4084 currently-running task to complete, but cancels any queued tasks. */
4086 bool
4087 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4089 struct agent_info *agent = aq->agent;
/* Stop the drain thread before tearing down its synchronization
   primitives.  */
4091 finalize_async_thread (aq)
4093 pthread_mutex_lock (&agent->async_queues_mutex);
/* Destroy AQ's mutex, condvars and HSA queue; any failure jumps to the
   common unlock-and-fail path so async_queues_mutex is never leaked.  */
4095 int err;
4096 if ((err = pthread_mutex_destroy (&aq->mutex)))
4098 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4099 goto fail;
4101 if (pthread_cond_destroy (&aq->queue_cond_in))
4103 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4104 goto fail;
4106 if (pthread_cond_destroy (&aq->queue_cond_out))
4108 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4109 goto fail;
4111 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4112 if (status != HSA_STATUS_SUCCESS)
4114 hsa_error ("Error destroying command queue", status);
4115 goto fail;
/* Unlink AQ from the agent's doubly-linked list of queues.  */
4118 if (aq->prev)
4119 aq->prev->next = aq->next;
4120 if (aq->next)
4121 aq->next->prev = aq->prev;
4122 if (agent->async_queues == aq)
4123 agent->async_queues = aq->next;
4125 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4127 free (aq);
4128 pthread_mutex_unlock (&agent->async_queues_mutex);
4129 return true;
4131 fail:
/* NOTE(review): on failure AQ stays linked and is not freed -- presumably
   intentional since its state is indeterminate; confirm.  */
4132 pthread_mutex_unlock (&agent->async_queues_mutex);
4133 return false;
/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}
/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  /* wait_queue blocks until AQ drains; nothing can fail here.  */
  return true;
}
/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* A queue is trivially ordered with respect to itself.  */
  if (aq1 == aq2)
    return true;

  /* Stream aq2 waits for aq1 to complete work that has been scheduled
     to run on it up to this point.  */
  struct placeholder *placeholderp = queue_push_placeholder (aq1);
  queue_push_asyncwait (aq2, placeholderp);
  return true;
}
/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  /* FN (DATA) will run once everything queued before it has drained.  */
  queue_push_callback (aq, fn, data);
}
4181 /* Queue up an asynchronous data copy from host to DEVICE. */
4183 bool
4184 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4185 size_t n, struct goacc_asyncqueue *aq)
4187 struct agent_info *agent = get_agent_info (device);
4188 assert (agent == aq->agent);
4189 queue_push_copy (aq, dst, src, n);
4190 return true;
4193 /* Queue up an asynchronous data copy from DEVICE to host. */
4195 bool
4196 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4197 size_t n, struct goacc_asyncqueue *aq)
4199 struct agent_info *agent = get_agent_info (device);
4200 assert (agent == aq->agent);
4201 queue_push_copy (aq, dst, src, n);
4202 return true;
4205 union goacc_property_value
4206 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4208 struct agent_info *agent = get_agent_info (device);
4210 union goacc_property_value propval = { .val = 0 };
4212 switch (prop)
4214 case GOACC_PROPERTY_FREE_MEMORY:
4215 /* Not supported. */
4216 break;
4217 case GOACC_PROPERTY_MEMORY:
4219 size_t size;
4220 hsa_region_t region = agent->data_region;
4221 hsa_status_t status =
4222 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4223 if (status == HSA_STATUS_SUCCESS)
4224 propval.val = size;
4225 break;
4227 case GOACC_PROPERTY_NAME:
4228 propval.ptr = agent->name;
4229 break;
4230 case GOACC_PROPERTY_VENDOR:
4231 propval.ptr = agent->vendor_name;
4232 break;
4233 case GOACC_PROPERTY_DRIVER:
4234 propval.ptr = hsa_context.driver_version_s;
4235 break;
4238 return propval;
4241 /* Set up plugin-specific thread-local-data (host-side). */
4243 void *
4244 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4246 struct gcn_thread *thread_data
4247 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4249 thread_data->async = GOMP_ASYNC_SYNC;
4251 return (void *) thread_data;
/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  /* The data is a single malloc'd struct gcn_thread; just release it.
     free (NULL) is a harmless no-op.  */
  free (data);
}
4262 /* }}} */