Fix gcc.c-torture/execute/ieee/cdivchkf.c on hpux
[official-gcc.git] / libgomp / plugin / plugin-gcn.c
blob2771123252a87389febf5b421d21f84f89cd8267
1 /* Plugin for AMD GCN execution.
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* {{{ Includes and defines */
31 #include "config.h"
32 #include "symcat.h"
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <pthread.h>
37 #include <inttypes.h>
38 #include <stdbool.h>
39 #include <limits.h>
40 #include <hsa.h>
41 #include <hsa_ext_amd.h>
42 #include <dlfcn.h>
43 #include <signal.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
47 #include <elf.h>
48 #include "oacc-plugin.h"
49 #include "oacc-int.h"
50 #include <assert.h>
52 /* These probably won't be in elf.h for a while. */
53 #ifndef R_AMDGPU_NONE
54 #define R_AMDGPU_NONE 0
55 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57 #define R_AMDGPU_ABS64 3 /* S + A */
58 #define R_AMDGPU_REL32 4 /* S + A - P */
59 #define R_AMDGPU_REL64 5 /* S + A - P */
60 #define R_AMDGPU_ABS32 6 /* S + A */
61 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
66 #define R_AMDGPU_RELATIVE64 13 /* B + A */
67 #endif
69 /* GCN specific definitions for asynchronous queues. */
71 #define ASYNC_QUEUE_SIZE 64
72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
73 #define DEBUG_QUEUES 0
74 #define DEBUG_THREAD_SLEEP 0
75 #define DEBUG_THREAD_SIGNAL 0
77 /* Defaults. */
78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
80 /* Secure getenv() which returns NULL if running as SUID/SGID. */
81 #ifndef HAVE_SECURE_GETENV
82 #ifdef HAVE___SECURE_GETENV
83 #define secure_getenv __secure_getenv
84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
87 #include <unistd.h>
89 /* Implementation of secure_getenv() for targets where it is not provided but
90 we have at least means to test real and effective IDs. */
92 static char *
93 secure_getenv (const char *name)
95 if ((getuid () == geteuid ()) && (getgid () == getegid ()))
96 return getenv (name);
97 else
98 return NULL;
101 #else
102 #define secure_getenv getenv
103 #endif
104 #endif
106 /* }}} */
107 /* {{{ Types */
109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
111 struct gcn_thread
113 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
114 int async;
117 /* As an HSA runtime is dlopened, following structure defines function
118 pointers utilized by the HSA plug-in. */
120 struct hsa_runtime_fn_info
122 /* HSA runtime. */
123 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
124 const char **status_string);
125 hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
126 void *value);
127 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
128 hsa_agent_info_t attribute,
129 void *value);
130 hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
131 hsa_isa_info_t attribute,
132 uint32_t index,
133 void *value);
134 hsa_status_t (*hsa_init_fn) (void);
135 hsa_status_t (*hsa_iterate_agents_fn)
136 (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
137 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
138 hsa_region_info_t attribute,
139 void *value);
140 hsa_status_t (*hsa_queue_create_fn)
141 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
142 void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
143 void *data, uint32_t private_segment_size,
144 uint32_t group_segment_size, hsa_queue_t **queue);
145 hsa_status_t (*hsa_agent_iterate_regions_fn)
146 (hsa_agent_t agent,
147 hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
148 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
149 hsa_status_t (*hsa_executable_create_fn)
150 (hsa_profile_t profile, hsa_executable_state_t executable_state,
151 const char *options, hsa_executable_t *executable);
152 hsa_status_t (*hsa_executable_global_variable_define_fn)
153 (hsa_executable_t executable, const char *variable_name, void *address);
154 hsa_status_t (*hsa_executable_load_code_object_fn)
155 (hsa_executable_t executable, hsa_agent_t agent,
156 hsa_code_object_t code_object, const char *options);
157 hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
158 const char *options);
159 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
160 uint32_t num_consumers,
161 const hsa_agent_t *consumers,
162 hsa_signal_t *signal);
163 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
164 void **ptr);
165 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
166 hsa_access_permission_t access);
167 hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
168 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
169 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
170 hsa_status_t (*hsa_executable_get_symbol_fn)
171 (hsa_executable_t executable, const char *module_name,
172 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
173 hsa_executable_symbol_t *symbol);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn)
175 (hsa_executable_symbol_t executable_symbol,
176 hsa_executable_symbol_info_t attribute, void *value);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn)
178 (hsa_executable_t executable,
179 hsa_status_t (*callback)(hsa_executable_t executable,
180 hsa_executable_symbol_t symbol, void *data),
181 void *data);
182 uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
183 uint64_t value);
184 uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
185 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
186 hsa_signal_value_t value);
187 void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
188 hsa_signal_value_t value);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
190 (hsa_signal_t signal, hsa_signal_condition_t condition,
191 hsa_signal_value_t compare_value, uint64_t timeout_hint,
192 hsa_wait_state_t wait_state_hint);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
194 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
196 hsa_status_t (*hsa_code_object_deserialize_fn)
197 (void *serialized_code_object, size_t serialized_code_object_size,
198 const char *options, hsa_code_object_t *code_object);
199 hsa_status_t (*hsa_amd_memory_lock_fn)
200 (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
201 void **agent_ptr);
202 hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
203 hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
204 (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
205 const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
206 const hsa_dim3_t *range, hsa_agent_t copy_agent,
207 hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
208 const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
211 /* Structure describing the run-time and grid properties of an HSA kernel
212 lauch. This needs to match the format passed to GOMP_OFFLOAD_run. */
214 struct GOMP_kernel_launch_attributes
216 /* Number of dimensions the workload has. Maximum number is 3. */
217 uint32_t ndim;
218 /* Size of the grid in the three respective dimensions. */
219 uint32_t gdims[3];
220 /* Size of work-groups in the respective dimensions. */
221 uint32_t wdims[3];
224 /* Collection of information needed for a dispatch of a kernel from a
225 kernel. */
227 struct kernel_dispatch
229 struct agent_info *agent;
230 /* Pointer to a command queue associated with a kernel dispatch agent. */
231 void *queue;
232 /* Pointer to a memory space used for kernel arguments passing. */
233 void *kernarg_address;
234 /* Kernel object. */
235 uint64_t object;
236 /* Synchronization signal used for dispatch synchronization. */
237 uint64_t signal;
238 /* Private segment size. */
239 uint32_t private_segment_size;
240 /* Group segment size. */
241 uint32_t group_segment_size;
244 /* Structure of the kernargs segment, supporting console output.
246 This needs to match the definitions in Newlib, and the expectations
247 in libgomp target code. */
249 struct kernargs {
250 struct kernargs_abi abi;
252 /* Output data. */
253 struct output output_data;
256 /* A queue entry for a future asynchronous launch. */
258 struct kernel_launch
260 struct kernel_info *kernel;
261 void *vars;
262 struct GOMP_kernel_launch_attributes kla;
265 /* A queue entry for a future callback. */
267 struct callback
269 void (*fn)(void *);
270 void *data;
273 /* A data struct for the copy_data callback. */
275 struct copy_data
277 void *dst;
278 const void *src;
279 size_t len;
280 struct goacc_asyncqueue *aq;
283 /* A queue entry for a placeholder. These correspond to a wait event. */
285 struct placeholder
287 int executed;
288 pthread_cond_t cond;
289 pthread_mutex_t mutex;
292 /* A queue entry for a wait directive. */
294 struct asyncwait_info
296 struct placeholder *placeholderp;
299 /* Encode the type of an entry in an async queue. */
301 enum entry_type
303 KERNEL_LAUNCH,
304 CALLBACK,
305 ASYNC_WAIT,
306 ASYNC_PLACEHOLDER
309 /* An entry in an async queue. */
311 struct queue_entry
313 enum entry_type type;
314 union {
315 struct kernel_launch launch;
316 struct callback callback;
317 struct asyncwait_info asyncwait;
318 struct placeholder placeholder;
319 } u;
322 /* An async queue header.
324 OpenMP may create one of these.
325 OpenACC may create many. */
327 struct goacc_asyncqueue
329 struct agent_info *agent;
330 hsa_queue_t *hsa_queue;
332 pthread_t thread_drain_queue;
333 pthread_mutex_t mutex;
334 pthread_cond_t queue_cond_in;
335 pthread_cond_t queue_cond_out;
336 struct queue_entry queue[ASYNC_QUEUE_SIZE];
337 int queue_first;
338 int queue_n;
339 int drain_queue_stop;
341 int id;
342 struct goacc_asyncqueue *prev;
343 struct goacc_asyncqueue *next;
346 /* Mkoffload uses this structure to describe a kernel.
348 OpenMP kernel dimensions are passed at runtime.
349 OpenACC kernel dimensions are passed at compile time, here. */
351 struct hsa_kernel_description
353 const char *name;
354 int oacc_dims[3]; /* Only present for GCN kernels. */
355 int sgpr_count;
356 int vpgr_count;
359 /* Mkoffload uses this structure to describe an offload variable. */
361 struct global_var_info
363 const char *name;
364 void *address;
367 /* Mkoffload uses this structure to describe all the kernels in a
368 loadable module. These are passed the libgomp via static constructors. */
370 struct gcn_image_desc
372 struct gcn_image {
373 size_t size;
374 void *image;
375 } *gcn_image;
376 const unsigned kernel_count;
377 struct hsa_kernel_description *kernel_infos;
378 const unsigned ind_func_count;
379 const unsigned global_variable_count;
382 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
383 support.
384 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
386 typedef enum {
387 EF_AMDGPU_MACH_UNSUPPORTED = -1,
388 EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
389 EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
390 EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
391 EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
392 EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
393 EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
394 EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041
395 } EF_AMDGPU_MACH;
397 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
398 typedef EF_AMDGPU_MACH gcn_isa;
400 /* Description of an HSA GPU agent (device) and the program associated with
401 it. */
403 struct agent_info
405 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
406 hsa_agent_t id;
407 /* The user-visible device number. */
408 int device_id;
409 /* Whether the agent has been initialized. The fields below are usable only
410 if it has been. */
411 bool initialized;
413 /* The instruction set architecture of the device. */
414 gcn_isa device_isa;
415 /* Name of the agent. */
416 char name[64];
417 /* Name of the vendor of the agent. */
418 char vendor_name[64];
419 /* Command queues of the agent. */
420 hsa_queue_t *sync_queue;
421 struct goacc_asyncqueue *async_queues, *omp_async_queue;
422 pthread_mutex_t async_queues_mutex;
424 /* The HSA memory region from which to allocate kernel arguments. */
425 hsa_region_t kernarg_region;
427 /* The HSA memory region from which to allocate device data. */
428 hsa_region_t data_region;
430 /* Allocated ephemeral memories (team arena and stack space). */
431 struct ephemeral_memories_list *ephemeral_memories_list;
432 pthread_mutex_t ephemeral_memories_write_lock;
434 /* Read-write lock that protects kernels which are running or about to be run
435 from interference with loading and unloading of images. Needs to be
436 locked for reading while a kernel is being run, and for writing if the
437 list of modules is manipulated (and thus the HSA program invalidated). */
438 pthread_rwlock_t module_rwlock;
440 /* The module associated with this kernel. */
441 struct module_info *module;
443 /* Mutex enforcing that only one thread will finalize the HSA program. A
444 thread should have locked agent->module_rwlock for reading before
445 acquiring it. */
446 pthread_mutex_t prog_mutex;
447 /* Flag whether the HSA program that consists of all the modules has been
448 finalized. */
449 bool prog_finalized;
450 /* HSA executable - the finalized program that is used to locate kernels. */
451 hsa_executable_t executable;
454 /* Information required to identify, finalize and run any given kernel. */
456 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
458 struct kernel_info
460 /* Name of the kernel, required to locate it within the GCN object-code
461 module. */
462 const char *name;
463 /* The specific agent the kernel has been or will be finalized for and run
464 on. */
465 struct agent_info *agent;
466 /* The specific module where the kernel takes place. */
467 struct module_info *module;
468 /* Information provided by mkoffload associated with the kernel. */
469 struct hsa_kernel_description *description;
470 /* Mutex enforcing that at most once thread ever initializes a kernel for
471 use. A thread should have locked agent->module_rwlock for reading before
472 acquiring it. */
473 pthread_mutex_t init_mutex;
474 /* Flag indicating whether the kernel has been initialized and all fields
475 below it contain valid data. */
476 bool initialized;
477 /* Flag indicating that the kernel has a problem that blocks an execution. */
478 bool initialization_failed;
479 /* The object to be put into the dispatch queue. */
480 uint64_t object;
481 /* Required size of kernel arguments. */
482 uint32_t kernarg_segment_size;
483 /* Required size of group segment. */
484 uint32_t group_segment_size;
485 /* Required size of private segment. */
486 uint32_t private_segment_size;
487 /* Set up for OpenMP or OpenACC? */
488 enum offload_kind kind;
491 /* Information about a particular GCN module, its image and kernels. */
493 struct module_info
495 /* The description with which the program has registered the image. */
496 struct gcn_image_desc *image_desc;
497 /* GCN heap allocation. */
498 struct heap *heap;
499 /* Physical boundaries of the loaded module. */
500 Elf64_Addr phys_address_start;
501 Elf64_Addr phys_address_end;
503 bool constructors_run_p;
504 struct kernel_info *init_array_func, *fini_array_func;
506 /* Number of kernels in this module. */
507 int kernel_count;
508 /* An array of kernel_info structures describing each kernel in this
509 module. */
510 struct kernel_info kernels[];
513 /* A linked list of memory arenas allocated on the device.
514 These are used by OpenMP, as a means to optimize per-team malloc,
515 and for host-accessible stack space. */
517 struct ephemeral_memories_list
519 struct ephemeral_memories_list *next;
521 /* The size is determined by the number of teams and threads. */
522 size_t size;
523 /* The device address allocated memory. */
524 void *address;
525 /* A flag to prevent two asynchronous kernels trying to use the same memory.
526 The mutex is locked until the kernel exits. */
527 pthread_mutex_t in_use;
530 /* Information about the whole HSA environment and all of its agents. */
532 struct hsa_context_info
534 /* Whether the structure has been initialized. */
535 bool initialized;
536 /* Number of usable GPU HSA agents in the system. */
537 int agent_count;
538 /* Array of agent_info structures describing the individual HSA agents. */
539 struct agent_info *agents;
540 /* Driver version string. */
541 char driver_version_s[30];
544 /* }}} */
545 /* {{{ Global variables */
547 /* Information about the whole HSA environment and all of its agents. */
549 static struct hsa_context_info hsa_context;
551 /* HSA runtime functions that are initialized in init_hsa_context. */
553 static struct hsa_runtime_fn_info hsa_fns;
555 /* Heap space, allocated target-side, provided for use of newlib malloc.
556 Each module should have it's own heap allocated.
557 Beware that heap usage increases with OpenMP teams. See also arenas. */
559 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
561 /* Ephemeral memory sizes for each kernel launch. */
563 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
564 static int stack_size = DEFAULT_GCN_STACK_SIZE;
565 static int lowlat_size = -1;
567 /* Flag to decide whether print to stderr information about what is going on.
568 Set in init_debug depending on environment variables. */
570 static bool debug;
572 /* Flag to decide if the runtime should suppress a possible fallback to host
573 execution. */
575 static bool suppress_host_fallback;
577 /* Flag to locate HSA runtime shared library that is dlopened
578 by this plug-in. */
580 static const char *hsa_runtime_lib;
582 /* Flag to decide if the runtime should support also CPU devices (can be
583 a simulator). */
585 static bool support_cpu_devices;
587 /* Runtime dimension overrides. Zero indicates default. */
589 static int override_x_dim = 0;
590 static int override_z_dim = 0;
592 /* }}} */
593 /* {{{ Debug & Diagnostic */
595 /* Print a message to stderr if GCN_DEBUG value is set to true. */
597 #define DEBUG_PRINT(...) \
598 do \
600 if (debug) \
602 fprintf (stderr, __VA_ARGS__); \
605 while (false);
607 /* Flush stderr if GCN_DEBUG value is set to true. */
609 #define DEBUG_FLUSH() \
610 do { \
611 if (debug) \
612 fflush (stderr); \
613 } while (false)
615 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
616 is set to true. */
618 #define DEBUG_LOG(prefix, ...) \
619 do \
621 DEBUG_PRINT (prefix); \
622 DEBUG_PRINT (__VA_ARGS__); \
623 DEBUG_FLUSH (); \
624 } while (false)
626 /* Print a debugging message to stderr. */
628 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
630 /* Print a warning message to stderr. */
632 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
634 /* Print HSA warning STR with an HSA STATUS code. */
636 static void
637 hsa_warn (const char *str, hsa_status_t status)
639 if (!debug)
640 return;
642 const char *hsa_error_msg = "[unknown]";
643 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
645 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
646 hsa_error_msg);
649 /* Report a fatal error STR together with the HSA error corresponding to STATUS
650 and terminate execution of the current process. */
652 static void
653 hsa_fatal (const char *str, hsa_status_t status)
655 const char *hsa_error_msg = "[unknown]";
656 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
657 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
658 hsa_error_msg);
661 /* Like hsa_fatal, except only report error message, and return FALSE
662 for propagating error processing to outside of plugin. */
664 static bool
665 hsa_error (const char *str, hsa_status_t status)
667 const char *hsa_error_msg = "[unknown]";
668 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
669 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
670 hsa_error_msg);
671 return false;
674 /* Dump information about the available hardware. */
676 static void
677 dump_hsa_system_info (void)
679 hsa_status_t status;
681 hsa_endianness_t endianness;
682 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
683 &endianness);
684 if (status == HSA_STATUS_SUCCESS)
685 switch (endianness)
687 case HSA_ENDIANNESS_LITTLE:
688 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
689 break;
690 case HSA_ENDIANNESS_BIG:
691 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
692 break;
693 default:
694 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
696 else
697 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
699 uint8_t extensions[128];
700 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
701 &extensions);
702 if (status == HSA_STATUS_SUCCESS)
704 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
705 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
707 else
708 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
711 /* Dump information about the available hardware. */
713 static void
714 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
716 switch (machine_model)
718 case HSA_MACHINE_MODEL_SMALL:
719 GCN_DEBUG ("%s: SMALL\n", s);
720 break;
721 case HSA_MACHINE_MODEL_LARGE:
722 GCN_DEBUG ("%s: LARGE\n", s);
723 break;
724 default:
725 GCN_WARNING ("%s: UNKNOWN\n", s);
726 break;
730 /* Dump information about the available hardware. */
732 static void
733 dump_profile (hsa_profile_t profile, const char *s)
735 switch (profile)
737 case HSA_PROFILE_FULL:
738 GCN_DEBUG ("%s: FULL\n", s);
739 break;
740 case HSA_PROFILE_BASE:
741 GCN_DEBUG ("%s: BASE\n", s);
742 break;
743 default:
744 GCN_WARNING ("%s: UNKNOWN\n", s);
745 break;
749 /* Dump information about a device memory region. */
751 static hsa_status_t
752 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
754 hsa_status_t status;
756 hsa_region_segment_t segment;
757 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
758 &segment);
759 if (status == HSA_STATUS_SUCCESS)
761 if (segment == HSA_REGION_SEGMENT_GLOBAL)
762 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
763 else if (segment == HSA_REGION_SEGMENT_READONLY)
764 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
765 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
766 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
767 else if (segment == HSA_REGION_SEGMENT_GROUP)
768 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
769 else
770 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
772 else
773 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
775 if (segment == HSA_REGION_SEGMENT_GLOBAL)
777 uint32_t flags;
778 status
779 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
780 &flags);
781 if (status == HSA_STATUS_SUCCESS)
783 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
784 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
785 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
786 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
787 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
788 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
790 else
791 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
794 size_t size;
795 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
796 if (status == HSA_STATUS_SUCCESS)
797 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
798 else
799 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
801 status
802 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
803 &size);
804 if (status == HSA_STATUS_SUCCESS)
805 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
806 else
807 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
809 bool alloc_allowed;
810 status
811 = hsa_fns.hsa_region_get_info_fn (region,
812 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
813 &alloc_allowed);
814 if (status == HSA_STATUS_SUCCESS)
815 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
816 else
817 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
819 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
820 return HSA_STATUS_SUCCESS;
822 status
823 = hsa_fns.hsa_region_get_info_fn (region,
824 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
825 &size);
826 if (status == HSA_STATUS_SUCCESS)
827 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
828 else
829 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
831 size_t align;
832 status
833 = hsa_fns.hsa_region_get_info_fn (region,
834 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
835 &align);
836 if (status == HSA_STATUS_SUCCESS)
837 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
838 else
839 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
841 return HSA_STATUS_SUCCESS;
844 /* Dump information about all the device memory regions. */
846 static void
847 dump_hsa_regions (hsa_agent_t agent)
849 hsa_status_t status;
850 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
851 dump_hsa_region,
852 NULL);
853 if (status != HSA_STATUS_SUCCESS)
854 hsa_error ("Dumping hsa regions failed", status);
857 /* Dump information about the available devices. */
859 static hsa_status_t
860 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
862 hsa_status_t status;
864 char buf[64];
865 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
866 &buf);
867 if (status == HSA_STATUS_SUCCESS)
868 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
869 else
870 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
872 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
873 &buf);
874 if (status == HSA_STATUS_SUCCESS)
875 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
876 else
877 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
879 hsa_machine_model_t machine_model;
880 status
881 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
882 &machine_model);
883 if (status == HSA_STATUS_SUCCESS)
884 dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
885 else
886 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
888 hsa_profile_t profile;
889 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
890 &profile);
891 if (status == HSA_STATUS_SUCCESS)
892 dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
893 else
894 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
896 hsa_device_type_t device_type;
897 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
898 &device_type);
899 if (status == HSA_STATUS_SUCCESS)
901 switch (device_type)
903 case HSA_DEVICE_TYPE_CPU:
904 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
905 break;
906 case HSA_DEVICE_TYPE_GPU:
907 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
908 break;
909 case HSA_DEVICE_TYPE_DSP:
910 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
911 break;
912 default:
913 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
914 break;
917 else
918 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
920 uint32_t cu_count;
921 status = hsa_fns.hsa_agent_get_info_fn
922 (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
923 if (status == HSA_STATUS_SUCCESS)
924 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
925 else
926 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
928 uint32_t size;
929 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
930 &size);
931 if (status == HSA_STATUS_SUCCESS)
932 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
933 else
934 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
936 uint32_t max_dim;
937 status = hsa_fns.hsa_agent_get_info_fn (agent,
938 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
939 &max_dim);
940 if (status == HSA_STATUS_SUCCESS)
941 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
942 else
943 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
945 uint32_t max_size;
946 status = hsa_fns.hsa_agent_get_info_fn (agent,
947 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
948 &max_size);
949 if (status == HSA_STATUS_SUCCESS)
950 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
951 else
952 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
954 uint32_t grid_max_dim;
955 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
956 &grid_max_dim);
957 if (status == HSA_STATUS_SUCCESS)
958 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
959 else
960 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
962 uint32_t grid_max_size;
963 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
964 &grid_max_size);
965 if (status == HSA_STATUS_SUCCESS)
966 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
967 else
968 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
970 dump_hsa_regions (agent);
972 return HSA_STATUS_SUCCESS;
975 /* Forward reference. */
977 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
979 /* Helper function for dump_executable_symbols. */
981 static hsa_status_t
982 dump_executable_symbol (hsa_executable_t executable,
983 hsa_executable_symbol_t symbol,
984 void *data __attribute__((unused)))
986 char *name = get_executable_symbol_name (symbol);
988 if (name)
990 GCN_DEBUG ("executable symbol: %s\n", name);
991 free (name);
994 return HSA_STATUS_SUCCESS;
997 /* Dump all global symbol in an executable. */
999 static void
1000 dump_executable_symbols (hsa_executable_t executable)
1002 hsa_status_t status;
1003 status
1004 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1005 dump_executable_symbol,
1006 NULL);
1007 if (status != HSA_STATUS_SUCCESS)
1008 hsa_fatal ("Could not dump HSA executable symbols", status);
1011 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1013 static void
1014 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1016 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1018 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1019 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1020 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1021 fprintf (stderr, "%*sheap address: %p\n", indent, "",
1022 (void*)kernargs->abi.heap_ptr);
1023 fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
1024 "", (void*)kernargs->abi.arena_ptr,
1025 kernargs->abi.arena_size_per_team);
1026 fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
1027 "", (void*)kernargs->abi.stack_ptr,
1028 kernargs->abi.stack_size_per_thread);
1029 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1030 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1031 dispatch->private_segment_size);
1032 fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
1033 "", dispatch->group_segment_size);
1034 fprintf (stderr, "\n");
1037 /* }}} */
1038 /* {{{ Utility functions */
1040 /* Cast the thread local storage to gcn_thread. */
1042 static inline struct gcn_thread *
1043 gcn_thread (void)
1045 return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1048 /* Initialize debug and suppress_host_fallback according to the environment. */
1050 static void
1051 init_environment_variables (void)
1053 if (secure_getenv ("GCN_DEBUG"))
1054 debug = true;
1055 else
1056 debug = false;
1058 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1059 suppress_host_fallback = true;
1060 else
1061 suppress_host_fallback = false;
1063 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1064 if (hsa_runtime_lib == NULL)
1065 hsa_runtime_lib = "libhsa-runtime64.so.1";
1067 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1069 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1070 if (!x)
1071 x = secure_getenv ("GCN_NUM_GANGS");
1072 if (x)
1073 override_x_dim = atoi (x);
1075 const char *z = secure_getenv ("GCN_NUM_THREADS");
1076 if (!z)
1077 z = secure_getenv ("GCN_NUM_WORKERS");
1078 if (z)
1079 override_z_dim = atoi (z);
1081 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1082 if (heap)
1084 size_t tmp = atol (heap);
1085 if (tmp)
1086 gcn_kernel_heap_size = tmp;
1089 const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
1090 if (arena)
1092 int tmp = atoi (arena);
1093 if (tmp)
1094 team_arena_size = tmp;;
1097 const char *stack = secure_getenv ("GCN_STACK_SIZE");
1098 if (stack)
1100 int tmp = atoi (stack);
1101 if (tmp)
1102 stack_size = tmp;;
1105 const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1106 if (lowlat)
1107 lowlat_size = atoi (lowlat);
1110 /* Return malloc'd string with name of SYMBOL. */
1112 static char *
1113 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1115 hsa_status_t status;
1116 char *res;
1117 uint32_t len;
1118 const hsa_executable_symbol_info_t info_name_length
1119 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1121 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1122 &len);
1123 if (status != HSA_STATUS_SUCCESS)
1125 hsa_error ("Could not get length of symbol name", status);
1126 return NULL;
1129 res = GOMP_PLUGIN_malloc (len + 1);
1131 const hsa_executable_symbol_info_t info_name
1132 = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1134 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1136 if (status != HSA_STATUS_SUCCESS)
1138 hsa_error ("Could not get symbol name", status);
1139 free (res);
1140 return NULL;
1143 res[len] = '\0';
1145 return res;
1148 /* Get the number of GPU Compute Units. */
1150 static int
1151 get_cu_count (struct agent_info *agent)
1153 uint32_t cu_count;
1154 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1155 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1156 if (status == HSA_STATUS_SUCCESS)
1157 return cu_count;
1158 else
1159 return 64; /* The usual number for older devices. */
1162 /* Calculate the maximum grid size for OMP threads / OACC workers.
1163 This depends on the kernel's resource usage levels. */
1165 static int
1166 limit_worker_threads (int threads)
1168 /* FIXME Do something more inteligent here.
1169 GCN can always run 4 threads within a Compute Unit, but
1170 more than that depends on register usage. */
1171 if (threads > 16)
1172 threads = 16;
1173 return threads;
1176 /* This sets the maximum number of teams to twice the number of GPU Compute
1177 Units to avoid memory waste and corresponding memory access faults. */
1179 static int
1180 limit_teams (int teams, struct agent_info *agent)
1182 int max_teams = 2 * get_cu_count (agent);
1183 if (teams > max_teams)
1184 teams = max_teams;
1185 return teams;
1188 /* Parse the target attributes INPUT provided by the compiler and return true
1189 if we should run anything all. If INPUT is NULL, fill DEF with default
1190 values, then store INPUT or DEF into *RESULT.
1192 This is used for OpenMP only. */
1194 static bool
1195 parse_target_attributes (void **input,
1196 struct GOMP_kernel_launch_attributes *def,
1197 struct GOMP_kernel_launch_attributes **result,
1198 struct agent_info *agent)
1200 if (!input)
1201 GOMP_PLUGIN_fatal ("No target arguments provided");
1203 bool grid_attrs_found = false;
1204 bool gcn_dims_found = false;
1205 int gcn_teams = 0;
1206 int gcn_threads = 0;
1207 while (*input)
1209 intptr_t id = (intptr_t) *input++, val;
1211 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1212 val = (intptr_t) *input++;
1213 else
1214 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1216 val = (val > INT_MAX) ? INT_MAX : val;
1218 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1219 && ((id & GOMP_TARGET_ARG_ID_MASK)
1220 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1222 grid_attrs_found = true;
1223 break;
1225 else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1226 == GOMP_TARGET_ARG_DEVICE_ALL)
1228 gcn_dims_found = true;
1229 switch (id & GOMP_TARGET_ARG_ID_MASK)
1231 case GOMP_TARGET_ARG_NUM_TEAMS:
1232 gcn_teams = limit_teams (val, agent);
1233 break;
1234 case GOMP_TARGET_ARG_THREAD_LIMIT:
1235 gcn_threads = limit_worker_threads (val);
1236 break;
1237 default:
1243 if (gcn_dims_found)
1245 bool gfx900_workaround_p = false;
1247 if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1248 && gcn_threads == 0 && override_z_dim == 0)
1250 gfx900_workaround_p = true;
1251 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1252 "threads to at most 4 per team.\n");
1253 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1254 "GCN_NUM_THREADS=16\n");
1257 /* Ideally, when a dimension isn't explicitly specified, we should
1258 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1259 In practice, we tune for peak performance on BabelStream, which
1260 for OpenACC is currently 32 threads per CU. */
1261 def->ndim = 3;
1262 if (gcn_teams <= 0 && gcn_threads <= 0)
1264 /* Set up a reasonable number of teams and threads. */
1265 gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1266 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1267 def->gdims[2] = gcn_threads;
1269 else if (gcn_teams <= 0 && gcn_threads > 0)
1271 /* Auto-scale the number of teams with the number of threads. */
1272 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1273 def->gdims[2] = gcn_threads;
1275 else if (gcn_teams > 0 && gcn_threads <= 0)
1277 int max_threads = gfx900_workaround_p ? 4 : 16;
1279 /* Auto-scale the number of threads with the number of teams. */
1280 def->gdims[0] = gcn_teams;
1281 def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1282 if (def->gdims[2] == 0)
1283 def->gdims[2] = 1;
1284 else if (def->gdims[2] > max_threads)
1285 def->gdims[2] = max_threads;
1287 else
1289 def->gdims[0] = gcn_teams;
1290 def->gdims[2] = gcn_threads;
1292 def->gdims[1] = 64; /* Each thread is 64 work items wide. */
1293 def->wdims[0] = 1; /* Single team per work-group. */
1294 def->wdims[1] = 64;
1295 def->wdims[2] = 16;
1296 *result = def;
1297 return true;
1299 else if (!grid_attrs_found)
1301 def->ndim = 1;
1302 def->gdims[0] = 1;
1303 def->gdims[1] = 1;
1304 def->gdims[2] = 1;
1305 def->wdims[0] = 1;
1306 def->wdims[1] = 1;
1307 def->wdims[2] = 1;
1308 *result = def;
1309 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1310 return true;
1313 struct GOMP_kernel_launch_attributes *kla;
1314 kla = (struct GOMP_kernel_launch_attributes *) *input;
1315 *result = kla;
1316 if (kla->ndim == 0 || kla->ndim > 3)
1317 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1319 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1320 unsigned i;
1321 for (i = 0; i < kla->ndim; i++)
1323 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
1324 kla->gdims[i], kla->wdims[i]);
1325 if (kla->gdims[i] == 0)
1326 return false;
1328 return true;
1331 /* Return the group size given the requested GROUP size, GRID size and number
1332 of grid dimensions NDIM. */
1334 static uint32_t
1335 get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1337 if (group == 0)
1339 /* TODO: Provide a default via environment or device characteristics. */
1340 if (ndim == 1)
1341 group = 64;
1342 else if (ndim == 2)
1343 group = 8;
1344 else
1345 group = 4;
1348 if (group > grid)
1349 group = grid;
1350 return group;
1353 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1355 static void
1356 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1358 __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1361 /* A never-called callback for the HSA command queues. These signal events
1362 that we don't use, so we trigger an error.
1364 This "queue" is not to be confused with the async queues, below. */
1366 static void
1367 hsa_queue_callback (hsa_status_t status,
1368 hsa_queue_t *queue __attribute__ ((unused)),
1369 void *data __attribute__ ((unused)))
1371 hsa_fatal ("Asynchronous queue error", status);
1374 /* }}} */
1375 /* {{{ HSA initialization */
1377 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1379 static bool
1380 init_hsa_runtime_functions (void)
1382 #define DLSYM_FN(function) \
1383 hsa_fns.function##_fn = dlsym (handle, #function); \
1384 if (hsa_fns.function##_fn == NULL) \
1385 return false;
1386 #define DLSYM_OPT_FN(function) \
1387 hsa_fns.function##_fn = dlsym (handle, #function);
1388 void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1389 if (handle == NULL)
1390 return false;
1392 DLSYM_FN (hsa_status_string)
1393 DLSYM_FN (hsa_system_get_info)
1394 DLSYM_FN (hsa_agent_get_info)
1395 DLSYM_FN (hsa_init)
1396 DLSYM_FN (hsa_iterate_agents)
1397 DLSYM_FN (hsa_region_get_info)
1398 DLSYM_FN (hsa_queue_create)
1399 DLSYM_FN (hsa_agent_iterate_regions)
1400 DLSYM_FN (hsa_executable_destroy)
1401 DLSYM_FN (hsa_executable_create)
1402 DLSYM_FN (hsa_executable_global_variable_define)
1403 DLSYM_FN (hsa_executable_load_code_object)
1404 DLSYM_FN (hsa_executable_freeze)
1405 DLSYM_FN (hsa_signal_create)
1406 DLSYM_FN (hsa_memory_allocate)
1407 DLSYM_FN (hsa_memory_assign_agent)
1408 DLSYM_FN (hsa_memory_copy)
1409 DLSYM_FN (hsa_memory_free)
1410 DLSYM_FN (hsa_signal_destroy)
1411 DLSYM_FN (hsa_executable_get_symbol)
1412 DLSYM_FN (hsa_executable_symbol_get_info)
1413 DLSYM_FN (hsa_executable_iterate_symbols)
1414 DLSYM_FN (hsa_queue_add_write_index_release)
1415 DLSYM_FN (hsa_queue_load_read_index_acquire)
1416 DLSYM_FN (hsa_signal_wait_acquire)
1417 DLSYM_FN (hsa_signal_store_relaxed)
1418 DLSYM_FN (hsa_signal_store_release)
1419 DLSYM_FN (hsa_signal_load_acquire)
1420 DLSYM_FN (hsa_queue_destroy)
1421 DLSYM_FN (hsa_code_object_deserialize)
1422 DLSYM_OPT_FN (hsa_amd_memory_lock)
1423 DLSYM_OPT_FN (hsa_amd_memory_unlock)
1424 DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
1425 return true;
1426 #undef DLSYM_OPT_FN
1427 #undef DLSYM_FN
1430 static gcn_isa isa_code (const char *isa);
1432 /* Return true if the agent is a GPU and can accept of concurrent submissions
1433 from different threads. */
1435 static bool
1436 suitable_hsa_agent_p (hsa_agent_t agent)
1438 hsa_device_type_t device_type;
1439 hsa_status_t status
1440 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1441 &device_type);
1442 if (status != HSA_STATUS_SUCCESS)
1443 return false;
1445 switch (device_type)
1447 case HSA_DEVICE_TYPE_GPU:
1449 char name[64];
1450 hsa_status_t status
1451 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
1452 if (status != HSA_STATUS_SUCCESS
1453 || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
1455 GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
1456 status == HSA_STATUS_SUCCESS ? name : "invalid");
1457 return false;
1460 break;
1461 case HSA_DEVICE_TYPE_CPU:
1462 if (!support_cpu_devices)
1463 return false;
1464 break;
1465 default:
1466 return false;
1469 uint32_t features = 0;
1470 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1471 &features);
1472 if (status != HSA_STATUS_SUCCESS
1473 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1474 return false;
1475 hsa_queue_type_t queue_type;
1476 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1477 &queue_type);
1478 if (status != HSA_STATUS_SUCCESS
1479 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1480 return false;
1482 return true;
1485 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1486 agent_count in hsa_context. */
1488 static hsa_status_t
1489 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1491 if (suitable_hsa_agent_p (agent))
1492 hsa_context.agent_count++;
1493 return HSA_STATUS_SUCCESS;
1496 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1497 id to the describing structure in the hsa context. The index of the
1498 structure is pointed to by DATA, increment it afterwards. */
1500 static hsa_status_t
1501 assign_agent_ids (hsa_agent_t agent, void *data)
1503 if (suitable_hsa_agent_p (agent))
1505 int *agent_index = (int *) data;
1506 hsa_context.agents[*agent_index].id = agent;
1507 ++*agent_index;
1509 return HSA_STATUS_SUCCESS;
1512 /* Initialize hsa_context if it has not already been done.
1513 Return TRUE on success. */
1515 static bool
1516 init_hsa_context (void)
1518 hsa_status_t status;
1519 int agent_index = 0;
1521 if (hsa_context.initialized)
1522 return true;
1523 init_environment_variables ();
1524 if (!init_hsa_runtime_functions ())
1526 GCN_WARNING ("Run-time could not be dynamically opened\n");
1527 if (suppress_host_fallback)
1528 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1529 return false;
1531 status = hsa_fns.hsa_init_fn ();
1532 if (status != HSA_STATUS_SUCCESS)
1533 return hsa_error ("Run-time could not be initialized", status);
1534 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1536 if (debug)
1537 dump_hsa_system_info ();
1539 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1540 if (status != HSA_STATUS_SUCCESS)
1541 return hsa_error ("GCN GPU devices could not be enumerated", status);
1542 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1544 hsa_context.agents
1545 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1546 * sizeof (struct agent_info));
1547 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1548 if (status != HSA_STATUS_SUCCESS)
1549 return hsa_error ("Scanning compute agents failed", status);
1550 if (agent_index != hsa_context.agent_count)
1552 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1553 return false;
1556 if (debug)
1558 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1559 if (status != HSA_STATUS_SUCCESS)
1560 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1563 uint16_t minor, major;
1564 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1565 &minor);
1566 if (status != HSA_STATUS_SUCCESS)
1567 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1568 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1569 &major);
1570 if (status != HSA_STATUS_SUCCESS)
1571 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1573 size_t len = sizeof hsa_context.driver_version_s;
1574 int printed = snprintf (hsa_context.driver_version_s, len,
1575 "HSA Runtime %hu.%hu", (unsigned short int)major,
1576 (unsigned short int)minor);
1577 if (printed >= len)
1578 GCN_WARNING ("HSA runtime version string was truncated."
1579 "Version %hu.%hu is too long.", (unsigned short int)major,
1580 (unsigned short int)minor);
1582 hsa_context.initialized = true;
1583 return true;
1586 /* Verify that hsa_context has already been initialized and return the
1587 agent_info structure describing device number N. Return NULL on error. */
1589 static struct agent_info *
1590 get_agent_info (int n)
1592 if (!hsa_context.initialized)
1594 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1595 return NULL;
1597 if (n >= hsa_context.agent_count)
1599 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1600 return NULL;
1602 if (!hsa_context.agents[n].initialized)
1604 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1605 return NULL;
1607 return &hsa_context.agents[n];
1610 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1612 Selects (breaks at) a suitable region of type KIND. */
1614 static hsa_status_t
1615 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1616 hsa_region_global_flag_t kind)
1618 hsa_status_t status;
1619 hsa_region_segment_t segment;
1621 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1622 &segment);
1623 if (status != HSA_STATUS_SUCCESS)
1624 return status;
1625 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1626 return HSA_STATUS_SUCCESS;
1628 uint32_t flags;
1629 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1630 &flags);
1631 if (status != HSA_STATUS_SUCCESS)
1632 return status;
1633 if (flags & kind)
1635 *retval = region;
1636 return HSA_STATUS_INFO_BREAK;
1638 return HSA_STATUS_SUCCESS;
1641 /* Callback of hsa_agent_iterate_regions.
1643 Selects a kernargs memory region. */
1645 static hsa_status_t
1646 get_kernarg_memory_region (hsa_region_t region, void *data)
1648 return get_memory_region (region, (hsa_region_t *)data,
1649 HSA_REGION_GLOBAL_FLAG_KERNARG);
1652 /* Callback of hsa_agent_iterate_regions.
1654 Selects a coarse-grained memory region suitable for the heap and
1655 offload data. */
1657 static hsa_status_t
1658 get_data_memory_region (hsa_region_t region, void *data)
1660 return get_memory_region (region, (hsa_region_t *)data,
1661 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1664 static int
1665 elf_gcn_isa_field (Elf64_Ehdr *image)
1667 return image->e_flags & EF_AMDGPU_MACH_MASK;
1670 const static char *gcn_gfx803_s = "gfx803";
1671 const static char *gcn_gfx900_s = "gfx900";
1672 const static char *gcn_gfx906_s = "gfx906";
1673 const static char *gcn_gfx908_s = "gfx908";
1674 const static char *gcn_gfx90a_s = "gfx90a";
1675 const static char *gcn_gfx1030_s = "gfx1030";
1676 const static char *gcn_gfx1100_s = "gfx1100";
1677 const static int gcn_isa_name_len = 7;
1679 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1680 support the ISA. */
1682 static const char*
1683 isa_hsa_name (int isa) {
1684 switch(isa)
1686 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1687 return gcn_gfx803_s;
1688 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1689 return gcn_gfx900_s;
1690 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1691 return gcn_gfx906_s;
1692 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1693 return gcn_gfx908_s;
1694 case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1695 return gcn_gfx90a_s;
1696 case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1697 return gcn_gfx1030_s;
1698 case EF_AMDGPU_MACH_AMDGCN_GFX1100:
1699 return gcn_gfx1100_s;
1701 return NULL;
1704 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1705 with -march) or NULL if we do not support the ISA.
1706 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1708 static const char*
1709 isa_gcc_name (int isa) {
1710 switch(isa)
1712 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1713 return "fiji";
1714 default:
1715 return isa_hsa_name (isa);
1719 /* Returns the code which is used in the GCN object code to identify the ISA with
1720 the given name (as used by the HSA runtime). */
1722 static gcn_isa
1723 isa_code(const char *isa) {
1724 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1725 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1727 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1728 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1730 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1731 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1733 if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1734 return EF_AMDGPU_MACH_AMDGCN_GFX908;
1736 if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1737 return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1739 if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
1740 return EF_AMDGPU_MACH_AMDGCN_GFX1030;
1742 if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
1743 return EF_AMDGPU_MACH_AMDGCN_GFX1100;
1745 return EF_AMDGPU_MACH_UNSUPPORTED;
1748 /* CDNA2 devices have twice as many VGPRs compared to older devices. */
1750 static int
1751 max_isa_vgprs (int isa)
1753 switch (isa)
1755 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1756 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1757 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1758 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1759 return 256;
1760 case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1761 return 512;
1762 case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1763 return 512; /* 512 SIMD32 = 256 wavefrontsize64. */
1764 case EF_AMDGPU_MACH_AMDGCN_GFX1100:
1765 return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */
1767 GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1770 /* }}} */
1771 /* {{{ Run */
1773 /* Create or reuse a team arena and stack space.
1775 Team arenas are used by OpenMP to avoid calling malloc multiple times
1776 while setting up each team. This is purely a performance optimization.
1778 The stack space is used by all kernels. We must allocate it in such a
1779 way that the reverse offload implmentation can access the data.
1781 Allocating this memory costs performance, so this function will reuse an
1782 existing allocation if a large enough one is idle.
1783 The memory lock is released, but not deallocated, when the kernel exits. */
1785 static void
1786 configure_ephemeral_memories (struct kernel_info *kernel,
1787 struct kernargs_abi *kernargs, int num_teams,
1788 int num_threads)
1790 struct agent_info *agent = kernel->agent;
1791 struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1792 struct ephemeral_memories_list *item;
1794 int actual_arena_size = (kernel->kind == KIND_OPENMP
1795 ? team_arena_size : 0);
1796 int actual_arena_total_size = actual_arena_size * num_teams;
1797 size_t size = (actual_arena_total_size
1798 + num_teams * num_threads * stack_size);
1800 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1802 if (item->size < size)
1803 continue;
1805 if (pthread_mutex_trylock (&item->in_use) == 0)
1806 break;
1809 if (!item)
1811 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1812 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1813 num_teams, num_threads, size);
1815 if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1817 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1818 return;
1820 item = malloc (sizeof (*item));
1821 item->size = size;
1822 item->next = NULL;
1823 *next_ptr = item;
1825 if (pthread_mutex_init (&item->in_use, NULL))
1827 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1828 return;
1830 if (pthread_mutex_lock (&item->in_use))
1832 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1833 return;
1835 if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1837 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1838 return;
1841 hsa_status_t status;
1842 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1843 &item->address);
1844 if (status != HSA_STATUS_SUCCESS)
1845 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1846 status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1847 HSA_ACCESS_PERMISSION_RW);
1848 if (status != HSA_STATUS_SUCCESS)
1849 hsa_fatal ("Could not assign arena & stack memory to device", status);
1852 kernargs->arena_ptr = (actual_arena_total_size
1853 ? (uint64_t)item->address
1854 : 0);
1855 kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1856 kernargs->arena_size_per_team = actual_arena_size;
1857 kernargs->stack_size_per_thread = stack_size;
1860 /* Mark an ephemeral memory space available for reuse. */
1862 static void
1863 release_ephemeral_memories (struct agent_info* agent, void *address)
1865 struct ephemeral_memories_list *item;
1867 for (item = agent->ephemeral_memories_list; item; item = item->next)
1869 if (item->address == address)
1871 if (pthread_mutex_unlock (&item->in_use))
1872 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1873 return;
1876 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1879 /* Clean up all the allocated team arenas. */
1881 static bool
1882 destroy_ephemeral_memories (struct agent_info *agent)
1884 struct ephemeral_memories_list *item, *next;
1886 for (item = agent->ephemeral_memories_list; item; item = next)
1888 next = item->next;
1889 hsa_fns.hsa_memory_free_fn (item->address);
1890 if (pthread_mutex_destroy (&item->in_use))
1892 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1893 return false;
1895 free (item);
1897 agent->ephemeral_memories_list = NULL;
1899 return true;
1902 /* Allocate memory on a specified device. */
1904 static void *
1905 alloc_by_agent (struct agent_info *agent, size_t size)
1907 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1909 void *ptr;
1910 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1911 size, &ptr);
1912 if (status != HSA_STATUS_SUCCESS)
1914 hsa_error ("Could not allocate device memory", status);
1915 return NULL;
1918 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1919 HSA_ACCESS_PERMISSION_RW);
1920 if (status != HSA_STATUS_SUCCESS)
1922 hsa_error ("Could not assign data memory to device", status);
1923 return NULL;
1926 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1927 bool profiling_dispatch_p
1928 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1929 if (profiling_dispatch_p)
1931 acc_prof_info *prof_info = thr->prof_info;
1932 acc_event_info data_event_info;
1933 acc_api_info *api_info = thr->api_info;
1935 prof_info->event_type = acc_ev_alloc;
1937 data_event_info.data_event.event_type = prof_info->event_type;
1938 data_event_info.data_event.valid_bytes
1939 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1940 data_event_info.data_event.parent_construct
1941 = acc_construct_parallel;
1942 data_event_info.data_event.implicit = 1;
1943 data_event_info.data_event.tool_info = NULL;
1944 data_event_info.data_event.var_name = NULL;
1945 data_event_info.data_event.bytes = size;
1946 data_event_info.data_event.host_ptr = NULL;
1947 data_event_info.data_event.device_ptr = (void *) ptr;
1949 api_info->device_api = acc_device_api_other;
1951 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1952 api_info);
1955 return ptr;
1958 /* Create kernel dispatch data structure for given KERNEL, along with
1959 the necessary device signals and memory allocations. */
1961 static struct kernel_dispatch *
1962 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1963 int num_threads)
1965 struct agent_info *agent = kernel->agent;
1966 struct kernel_dispatch *shadow
1967 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1969 shadow->agent = kernel->agent;
1970 shadow->object = kernel->object;
1972 hsa_signal_t sync_signal;
1973 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1974 if (status != HSA_STATUS_SUCCESS)
1975 hsa_fatal ("Error creating the GCN sync signal", status);
1977 shadow->signal = sync_signal.handle;
1978 shadow->private_segment_size = kernel->private_segment_size;
1980 if (lowlat_size < 0)
1982 /* Divide the LDS between the number of running teams.
1983 Allocate not less than is defined in the kernel metadata. */
1984 int teams_per_cu = num_teams / get_cu_count (agent);
1985 int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
1986 shadow->group_segment_size
1987 = (kernel->group_segment_size > LDS_per_team
1988 ? kernel->group_segment_size
1989 : LDS_per_team);;
1991 else if (lowlat_size < GCN_LOWLAT_HEAP+8)
1992 /* Ensure that there's space for the OpenMP libgomp data. */
1993 shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
1994 else
1995 shadow->group_segment_size = (lowlat_size > 65536
1996 ? 65536
1997 : lowlat_size);
1999 /* We expect kernels to request a single pointer, explicitly, and the
2000 rest of struct kernargs, implicitly. If they request anything else
2001 then something is wrong. */
2002 if (kernel->kernarg_segment_size > 8)
2004 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
2005 return NULL;
2008 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
2009 sizeof (struct kernargs),
2010 &shadow->kernarg_address);
2011 if (status != HSA_STATUS_SUCCESS)
2012 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
2013 struct kernargs *kernargs = shadow->kernarg_address;
2015 /* Zero-initialize the output_data (minimum needed). */
2016 kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
2017 kernargs->output_data.next_output = 0;
2018 for (unsigned i = 0;
2019 i < (sizeof (kernargs->output_data.queue)
2020 / sizeof (kernargs->output_data.queue[0]));
2021 i++)
2022 kernargs->output_data.queue[i].written = 0;
2023 kernargs->output_data.consumed = 0;
2025 /* Pass in the heap location. */
2026 kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
2028 /* Create the ephemeral memory spaces. */
2029 configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
2031 /* Ensure we can recognize unset return values. */
2032 kernargs->output_data.return_value = 0xcafe0000;
2034 return shadow;
2037 static void
2038 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
2039 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
2041 int dev_num = dev_num64;
2042 GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
2043 NULL);
2046 /* Output any data written to console output from the kernel. It is expected
2047 that this function is polled during kernel execution.
2049 We print all entries from the last item printed to the next entry without
2050 a "written" flag. If the "final" flag is set then it'll continue right to
2051 the end.
2053 The print buffer is circular, but the from and to locations don't wrap when
2054 the buffer does, so the output limit is UINT_MAX. The target blocks on
2055 output when the buffer is full. */
2057 static void
2058 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2059 bool final)
2061 unsigned int limit = (sizeof (kernargs->output_data.queue)
2062 / sizeof (kernargs->output_data.queue[0]));
2064 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2065 __ATOMIC_ACQUIRE);
2066 unsigned int to = kernargs->output_data.next_output;
2068 if (from > to)
2070 /* Overflow. */
2071 if (final)
2072 printf ("GCN print buffer overflowed.\n");
2073 return;
2076 unsigned int i;
2077 for (i = from; i < to; i++)
2079 struct printf_data *data = &kernargs->output_data.queue[i%limit];
2081 if (!data->written && !final)
2082 break;
2084 switch (data->type)
2086 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2087 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2088 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2089 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
2090 case 4:
2091 process_reverse_offload (data->value_u64[0], data->value_u64[1],
2092 data->value_u64[2], data->value_u64[3],
2093 data->value_u64[4], data->value_u64[5]);
2094 break;
2095 default: printf ("GCN print buffer error!\n"); break;
2097 data->written = 0;
2098 __atomic_store_n (&kernargs->output_data.consumed, i+1,
2099 __ATOMIC_RELEASE);
2101 fflush (stdout);
2104 /* Release data structure created for a kernel dispatch in SHADOW argument,
2105 and clean up the signal and memory allocations. */
2107 static void
2108 release_kernel_dispatch (struct kernel_dispatch *shadow)
2110 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2112 struct kernargs *kernargs = shadow->kernarg_address;
2113 void *addr = (void *)kernargs->abi.arena_ptr;
2114 if (!addr)
2115 addr = (void *)kernargs->abi.stack_ptr;
2116 release_ephemeral_memories (shadow->agent, addr);
2118 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2120 hsa_signal_t s;
2121 s.handle = shadow->signal;
2122 hsa_fns.hsa_signal_destroy_fn (s);
2124 free (shadow);
2127 /* Extract the properties from a kernel binary. */
2129 static void
2130 init_kernel_properties (struct kernel_info *kernel)
2132 hsa_status_t status;
2133 struct agent_info *agent = kernel->agent;
2134 hsa_executable_symbol_t kernel_symbol;
2135 char *buf = alloca (strlen (kernel->name) + 4);
2136 sprintf (buf, "%s.kd", kernel->name);
2137 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2138 buf, agent->id,
2139 0, &kernel_symbol);
2140 if (status != HSA_STATUS_SUCCESS)
2142 hsa_warn ("Could not find symbol for kernel in the code object", status);
2143 fprintf (stderr, "not found name: '%s'\n", buf);
2144 dump_executable_symbols (agent->executable);
2145 goto failure;
2147 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2148 status = hsa_fns.hsa_executable_symbol_get_info_fn
2149 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2150 if (status != HSA_STATUS_SUCCESS)
2151 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2152 status = hsa_fns.hsa_executable_symbol_get_info_fn
2153 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2154 &kernel->kernarg_segment_size);
2155 if (status != HSA_STATUS_SUCCESS)
2156 hsa_fatal ("Could not get info about kernel argument size", status);
2157 status = hsa_fns.hsa_executable_symbol_get_info_fn
2158 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2159 &kernel->group_segment_size);
2160 if (status != HSA_STATUS_SUCCESS)
2161 hsa_fatal ("Could not get info about kernel group segment size", status);
2162 status = hsa_fns.hsa_executable_symbol_get_info_fn
2163 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2164 &kernel->private_segment_size);
2165 if (status != HSA_STATUS_SUCCESS)
2166 hsa_fatal ("Could not get info about kernel private segment size",
2167 status);
2169 /* The kernel type is not known until something tries to launch it. */
2170 kernel->kind = KIND_UNKNOWN;
2172 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2173 "following segment sizes: \n", kernel->name);
2174 GCN_DEBUG (" group_segment_size: %u\n",
2175 (unsigned) kernel->group_segment_size);
2176 GCN_DEBUG (" private_segment_size: %u\n",
2177 (unsigned) kernel->private_segment_size);
2178 GCN_DEBUG (" kernarg_segment_size: %u\n",
2179 (unsigned) kernel->kernarg_segment_size);
2180 return;
2182 failure:
2183 kernel->initialization_failed = true;
2186 /* Do all the work that is necessary before running KERNEL for the first time.
2187 The function assumes the program has been created, finalized and frozen by
2188 create_and_finalize_hsa_program. */
2190 static void
2191 init_kernel (struct kernel_info *kernel)
2193 if (pthread_mutex_lock (&kernel->init_mutex))
2194 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2195 if (kernel->initialized)
2197 if (pthread_mutex_unlock (&kernel->init_mutex))
2198 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2199 "mutex");
2201 return;
2204 init_kernel_properties (kernel);
2206 if (!kernel->initialization_failed)
2208 GCN_DEBUG ("\n");
2210 kernel->initialized = true;
2212 if (pthread_mutex_unlock (&kernel->init_mutex))
2213 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2214 "mutex");
2217 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2218 launch attributes from KLA.
2220 MODULE_LOCKED indicates that the caller already holds the lock and
2221 run_kernel need not lock it again.
2222 If AQ is NULL then agent->sync_queue will be used. */
2224 static void
2225 run_kernel (struct kernel_info *kernel, void *vars,
2226 struct GOMP_kernel_launch_attributes *kla,
2227 struct goacc_asyncqueue *aq, bool module_locked)
2229 struct agent_info *agent = kernel->agent;
2230 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2231 kernel->description->vpgr_count);
2233 /* Reduce the number of threads/workers if there are insufficient
2234 VGPRs available to run the kernels together. */
2235 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2237 int max_vgprs = max_isa_vgprs (agent->device_isa);
2238 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2239 int max_threads = (max_vgprs / granulated_vgprs) * 4;
2240 if (kla->gdims[2] > max_threads)
2242 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2243 " per team/gang - reducing to %d threads/workers.\n",
2244 kla->gdims[2], max_threads);
2245 kla->gdims[2] = max_threads;
2249 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2250 (aq ? aq->id : 0));
2251 GCN_DEBUG ("GCN launch attribs: gdims:[");
2252 int i;
2253 for (i = 0; i < kla->ndim; ++i)
2255 if (i)
2256 DEBUG_PRINT (", ");
2257 DEBUG_PRINT ("%u", kla->gdims[i]);
2259 DEBUG_PRINT ("], normalized gdims:[");
2260 for (i = 0; i < kla->ndim; ++i)
2262 if (i)
2263 DEBUG_PRINT (", ");
2264 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2266 DEBUG_PRINT ("], wdims:[");
2267 for (i = 0; i < kla->ndim; ++i)
2269 if (i)
2270 DEBUG_PRINT (", ");
2271 DEBUG_PRINT ("%u", kla->wdims[i]);
2273 DEBUG_PRINT ("]\n");
2274 DEBUG_FLUSH ();
2276 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2277 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2279 if (!agent->initialized)
2280 GOMP_PLUGIN_fatal ("Agent must be initialized");
2282 if (!kernel->initialized)
2283 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2285 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2287 uint64_t index
2288 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2289 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2291 /* Wait until the queue is not full before writing the packet. */
2292 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2293 >= command_q->size)
2296 /* Do not allow the dimensions to be overridden when running
2297 constructors or destructors. */
2298 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2299 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2301 hsa_kernel_dispatch_packet_t *packet;
2302 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2303 + index % command_q->size;
2305 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2306 packet->grid_size_x = override_x ? : kla->gdims[0];
2307 packet->workgroup_size_x = get_group_size (kla->ndim,
2308 packet->grid_size_x,
2309 kla->wdims[0]);
2311 if (kla->ndim >= 2)
2313 packet->grid_size_y = kla->gdims[1];
2314 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2315 kla->wdims[1]);
2317 else
2319 packet->grid_size_y = 1;
2320 packet->workgroup_size_y = 1;
2323 if (kla->ndim == 3)
2325 packet->grid_size_z = limit_worker_threads (override_z
2326 ? : kla->gdims[2]);
2327 packet->workgroup_size_z = get_group_size (kla->ndim,
2328 packet->grid_size_z,
2329 kla->wdims[2]);
2331 else
2333 packet->grid_size_z = 1;
2334 packet->workgroup_size_z = 1;
2337 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2338 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2339 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2340 packet->grid_size_x / packet->workgroup_size_x,
2341 packet->grid_size_y / packet->workgroup_size_y,
2342 packet->grid_size_z / packet->workgroup_size_z,
2343 packet->workgroup_size_x, packet->workgroup_size_y,
2344 packet->workgroup_size_z);
2346 struct kernel_dispatch *shadow
2347 = create_kernel_dispatch (kernel, packet->grid_size_x,
2348 packet->grid_size_z);
2349 shadow->queue = command_q;
2351 if (debug)
2353 fprintf (stderr, "\nKernel has following dependencies:\n");
2354 print_kernel_dispatch (shadow, 2);
2357 packet->private_segment_size = shadow->private_segment_size;
2358 packet->group_segment_size = shadow->group_segment_size;
2359 packet->kernel_object = shadow->object;
2360 packet->kernarg_address = shadow->kernarg_address;
2361 hsa_signal_t s;
2362 s.handle = shadow->signal;
2363 packet->completion_signal = s;
2364 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2365 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2367 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2369 uint16_t header;
2370 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2371 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2372 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2374 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2375 agent->device_id);
2377 packet_store_release ((uint32_t *) packet, header,
2378 (uint16_t) kla->ndim
2379 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2381 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2382 index);
2384 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2386 /* Root signal waits with 1ms timeout. */
2387 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2388 1000 * 1000,
2389 HSA_WAIT_STATE_BLOCKED) != 0)
2391 console_output (kernel, shadow->kernarg_address, false);
2393 console_output (kernel, shadow->kernarg_address, true);
2395 struct kernargs *kernargs = shadow->kernarg_address;
2396 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2398 release_kernel_dispatch (shadow);
2400 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2401 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2403 unsigned int upper = (return_value & ~0xffff) >> 16;
2404 if (upper == 0xcafe)
2405 ; // exit not called, normal termination.
2406 else if (upper == 0xffff)
2407 ; // exit called.
2408 else
2410 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2411 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2412 return_value);
2413 abort ();
2416 if (upper == 0xffff)
2418 unsigned int signal = (return_value >> 8) & 0xff;
2420 if (signal == SIGABRT)
2422 GCN_WARNING ("GCN Kernel aborted\n");
2423 abort ();
2425 else if (signal != 0)
2427 GCN_WARNING ("GCN Kernel received unknown signal\n");
2428 abort ();
2431 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2432 exit (return_value & 0xff);
2436 /* }}} */
2437 /* {{{ Load/Unload */
2439 /* Initialize KERNEL from D and other parameters. Return true on success. */
2441 static bool
2442 init_basic_kernel_info (struct kernel_info *kernel,
2443 struct hsa_kernel_description *d,
2444 struct agent_info *agent,
2445 struct module_info *module)
2447 kernel->agent = agent;
2448 kernel->module = module;
2449 kernel->name = d->name;
2450 kernel->description = d;
2451 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2453 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2454 return false;
2456 return true;
2459 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2461 static bool
2462 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2464 int isa_field = elf_gcn_isa_field (image);
2465 const char* isa_s = isa_hsa_name (isa_field);
2466 if (!isa_s)
2468 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2469 return false;
2472 if (isa_field != agent->device_isa)
2474 char msg[120];
2475 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2476 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2477 assert (agent_isa_s);
2478 assert (agent_isa_gcc_s);
2480 snprintf (msg, sizeof msg,
2481 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2482 "Try to recompile with '-foffload-options=-march=%s'.\n",
2483 isa_s, agent_isa_s, agent_isa_gcc_s);
2485 hsa_error (msg, HSA_STATUS_ERROR);
2486 return false;
2489 return true;
2492 /* Create and finalize the program consisting of all loaded modules. */
2494 static bool
2495 create_and_finalize_hsa_program (struct agent_info *agent)
2497 hsa_status_t status;
2498 bool res = true;
2499 if (pthread_mutex_lock (&agent->prog_mutex))
2501 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2502 return false;
2504 if (agent->prog_finalized)
2505 goto final;
2507 status
2508 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2509 HSA_EXECUTABLE_STATE_UNFROZEN,
2510 "", &agent->executable);
2511 if (status != HSA_STATUS_SUCCESS)
2513 hsa_error ("Could not create GCN executable", status);
2514 goto fail;
2517 /* Load any GCN modules. */
2518 struct module_info *module = agent->module;
2519 if (module)
2521 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2523 if (!isa_matches_agent (agent, image))
2524 goto fail;
2526 hsa_code_object_t co = { 0 };
2527 status = hsa_fns.hsa_code_object_deserialize_fn
2528 (module->image_desc->gcn_image->image,
2529 module->image_desc->gcn_image->size,
2530 NULL, &co);
2531 if (status != HSA_STATUS_SUCCESS)
2533 hsa_error ("Could not deserialize GCN code object", status);
2534 goto fail;
2537 status = hsa_fns.hsa_executable_load_code_object_fn
2538 (agent->executable, agent->id, co, "");
2539 if (status != HSA_STATUS_SUCCESS)
2541 hsa_error ("Could not load GCN code object", status);
2542 goto fail;
2545 if (!module->heap)
2547 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2548 gcn_kernel_heap_size,
2549 (void**)&module->heap);
2550 if (status != HSA_STATUS_SUCCESS)
2552 hsa_error ("Could not allocate memory for GCN heap", status);
2553 goto fail;
2556 status = hsa_fns.hsa_memory_assign_agent_fn
2557 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2558 if (status != HSA_STATUS_SUCCESS)
2560 hsa_error ("Could not assign GCN heap memory to device", status);
2561 goto fail;
2564 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2565 &gcn_kernel_heap_size,
2566 sizeof (gcn_kernel_heap_size));
2571 if (debug)
2572 dump_executable_symbols (agent->executable);
2574 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2575 if (status != HSA_STATUS_SUCCESS)
2577 hsa_error ("Could not freeze the GCN executable", status);
2578 goto fail;
2581 final:
2582 agent->prog_finalized = true;
2584 if (pthread_mutex_unlock (&agent->prog_mutex))
2586 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2587 res = false;
2590 return res;
2592 fail:
2593 res = false;
2594 goto final;
2597 /* Free the HSA program in agent and everything associated with it and set
2598 agent->prog_finalized and the initialized flags of all kernels to false.
2599 Return TRUE on success. */
2601 static bool
2602 destroy_hsa_program (struct agent_info *agent)
2604 if (!agent->prog_finalized)
2605 return true;
2607 hsa_status_t status;
2609 GCN_DEBUG ("Destroying the current GCN program.\n");
2611 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2612 if (status != HSA_STATUS_SUCCESS)
2613 return hsa_error ("Could not destroy GCN executable", status);
2615 if (agent->module)
2617 int i;
2618 for (i = 0; i < agent->module->kernel_count; i++)
2619 agent->module->kernels[i].initialized = false;
2621 if (agent->module->heap)
2623 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2624 agent->module->heap = NULL;
2627 agent->prog_finalized = false;
2628 return true;
2631 /* Deinitialize all information associated with MODULE and kernels within
2632 it. Return TRUE on success. */
2634 static bool
2635 destroy_module (struct module_info *module, bool locked)
2637 /* Run destructors before destroying module. */
2638 struct GOMP_kernel_launch_attributes kla =
2639 { 3,
2640 /* Grid size. */
2641 { 1, 64, 1 },
2642 /* Work-group size. */
2643 { 1, 64, 1 }
2646 if (module->fini_array_func)
2648 init_kernel (module->fini_array_func);
2649 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2651 module->constructors_run_p = false;
2653 int i;
2654 for (i = 0; i < module->kernel_count; i++)
2655 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2657 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2658 "mutex");
2659 return false;
2662 return true;
2665 /* }}} */
2666 /* {{{ Async */
2668 /* Callback of dispatch queues to report errors. */
2670 static void
2671 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2673 struct queue_entry *entry = &aq->queue[index];
2675 switch (entry->type)
2677 case KERNEL_LAUNCH:
2678 if (DEBUG_QUEUES)
2679 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2680 aq->agent->device_id, aq->id, index);
2681 run_kernel (entry->u.launch.kernel,
2682 entry->u.launch.vars,
2683 &entry->u.launch.kla, aq, false);
2684 if (DEBUG_QUEUES)
2685 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2686 aq->agent->device_id, aq->id, index);
2687 break;
2689 case CALLBACK:
2690 if (DEBUG_QUEUES)
2691 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2692 aq->agent->device_id, aq->id, index);
2693 entry->u.callback.fn (entry->u.callback.data);
2694 if (DEBUG_QUEUES)
2695 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2696 aq->agent->device_id, aq->id, index);
2697 break;
2699 case ASYNC_WAIT:
2701 /* FIXME: is it safe to access a placeholder that may already have
2702 been executed? */
2703 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2705 if (DEBUG_QUEUES)
2706 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2707 aq->agent->device_id, aq->id, index);
2709 pthread_mutex_lock (&placeholderp->mutex);
2711 while (!placeholderp->executed)
2712 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2714 pthread_mutex_unlock (&placeholderp->mutex);
2716 if (pthread_cond_destroy (&placeholderp->cond))
2717 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2719 if (pthread_mutex_destroy (&placeholderp->mutex))
2720 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2722 if (DEBUG_QUEUES)
2723 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2724 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2726 break;
2728 case ASYNC_PLACEHOLDER:
2729 pthread_mutex_lock (&entry->u.placeholder.mutex);
2730 entry->u.placeholder.executed = 1;
2731 pthread_cond_signal (&entry->u.placeholder.cond);
2732 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2733 break;
2735 default:
2736 GOMP_PLUGIN_fatal ("Unknown queue element");
2740 /* This function is run as a thread to service an async queue in the
2741 background. It runs continuously until the stop flag is set. */
2743 static void *
2744 drain_queue (void *thread_arg)
2746 struct goacc_asyncqueue *aq = thread_arg;
2748 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2750 aq->drain_queue_stop = 2;
2751 return NULL;
2754 pthread_mutex_lock (&aq->mutex);
2756 while (true)
2758 if (aq->drain_queue_stop)
2759 break;
2761 if (aq->queue_n > 0)
2763 pthread_mutex_unlock (&aq->mutex);
2764 execute_queue_entry (aq, aq->queue_first);
2766 pthread_mutex_lock (&aq->mutex);
2767 aq->queue_first = ((aq->queue_first + 1)
2768 % ASYNC_QUEUE_SIZE);
2769 aq->queue_n--;
2771 if (DEBUG_THREAD_SIGNAL)
2772 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2773 aq->agent->device_id, aq->id);
2774 pthread_cond_broadcast (&aq->queue_cond_out);
2775 pthread_mutex_unlock (&aq->mutex);
2777 if (DEBUG_QUEUES)
2778 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2779 aq->id);
2780 pthread_mutex_lock (&aq->mutex);
2782 else
2784 if (DEBUG_THREAD_SLEEP)
2785 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2786 aq->agent->device_id, aq->id);
2787 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2788 if (DEBUG_THREAD_SLEEP)
2789 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2790 aq->agent->device_id, aq->id);
2794 aq->drain_queue_stop = 2;
2795 if (DEBUG_THREAD_SIGNAL)
2796 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2797 aq->agent->device_id, aq->id);
2798 pthread_cond_broadcast (&aq->queue_cond_out);
2799 pthread_mutex_unlock (&aq->mutex);
2801 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2802 return NULL;
2805 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2806 is not usually the case. This is just a debug tool. */
2808 static void
2809 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2811 pthread_mutex_lock (&aq->mutex);
2813 while (aq->queue_n > 0)
2815 execute_queue_entry (aq, aq->queue_first);
2817 aq->queue_first = ((aq->queue_first + 1)
2818 % ASYNC_QUEUE_SIZE);
2819 aq->queue_n--;
2822 pthread_mutex_unlock (&aq->mutex);
2825 /* Block the current thread until an async queue is writable. The aq->mutex
2826 lock should be held on entry, and remains locked on exit. */
2828 static void
2829 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2831 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2833 /* Queue is full. Wait for it to not be full. */
2834 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2835 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2839 /* Request an asynchronous kernel launch on the specified queue. This
2840 may block if the queue is full, but returns without waiting for the
2841 kernel to run. */
2843 static void
2844 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2845 void *vars, struct GOMP_kernel_launch_attributes *kla)
2847 assert (aq->agent == kernel->agent);
2849 pthread_mutex_lock (&aq->mutex);
2851 wait_for_queue_nonfull (aq);
2853 int queue_last = ((aq->queue_first + aq->queue_n)
2854 % ASYNC_QUEUE_SIZE);
2855 if (DEBUG_QUEUES)
2856 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2857 aq->id, queue_last);
2859 aq->queue[queue_last].type = KERNEL_LAUNCH;
2860 aq->queue[queue_last].u.launch.kernel = kernel;
2861 aq->queue[queue_last].u.launch.vars = vars;
2862 aq->queue[queue_last].u.launch.kla = *kla;
2864 aq->queue_n++;
2866 if (DEBUG_THREAD_SIGNAL)
2867 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2868 aq->agent->device_id, aq->id);
2869 pthread_cond_signal (&aq->queue_cond_in);
2871 pthread_mutex_unlock (&aq->mutex);
2874 /* Request an asynchronous callback on the specified queue. The callback
2875 function will be called, with the given opaque data, from the appropriate
2876 async thread, when all previous items on that queue are complete. */
2878 static void
2879 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2880 void *data)
2882 pthread_mutex_lock (&aq->mutex);
2884 wait_for_queue_nonfull (aq);
2886 int queue_last = ((aq->queue_first + aq->queue_n)
2887 % ASYNC_QUEUE_SIZE);
2888 if (DEBUG_QUEUES)
2889 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2890 aq->id, queue_last);
2892 aq->queue[queue_last].type = CALLBACK;
2893 aq->queue[queue_last].u.callback.fn = fn;
2894 aq->queue[queue_last].u.callback.data = data;
2896 aq->queue_n++;
2898 if (DEBUG_THREAD_SIGNAL)
2899 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2900 aq->agent->device_id, aq->id);
2901 pthread_cond_signal (&aq->queue_cond_in);
2903 pthread_mutex_unlock (&aq->mutex);
2906 /* Request that a given async thread wait for another thread (unspecified) to
2907 reach the given placeholder. The wait will occur when all previous entries
2908 on the queue are complete. A placeholder is effectively a kind of signal
2909 which simply sets a flag when encountered in a queue. */
2911 static void
2912 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2913 struct placeholder *placeholderp)
2915 pthread_mutex_lock (&aq->mutex);
2917 wait_for_queue_nonfull (aq);
2919 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2920 if (DEBUG_QUEUES)
2921 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2922 aq->id, queue_last);
2924 aq->queue[queue_last].type = ASYNC_WAIT;
2925 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2927 aq->queue_n++;
2929 if (DEBUG_THREAD_SIGNAL)
2930 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2931 aq->agent->device_id, aq->id);
2932 pthread_cond_signal (&aq->queue_cond_in);
2934 pthread_mutex_unlock (&aq->mutex);
2937 /* Add a placeholder into an async queue. When the async thread reaches the
2938 placeholder it will set the "executed" flag to true and continue.
2939 Another thread may be waiting on this thread reaching the placeholder. */
2941 static struct placeholder *
2942 queue_push_placeholder (struct goacc_asyncqueue *aq)
2944 struct placeholder *placeholderp;
2946 pthread_mutex_lock (&aq->mutex);
2948 wait_for_queue_nonfull (aq);
2950 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2951 if (DEBUG_QUEUES)
2952 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2953 aq->id, queue_last);
2955 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2956 placeholderp = &aq->queue[queue_last].u.placeholder;
2958 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2960 pthread_mutex_unlock (&aq->mutex);
2961 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2964 if (pthread_cond_init (&placeholderp->cond, NULL))
2966 pthread_mutex_unlock (&aq->mutex);
2967 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2970 placeholderp->executed = 0;
2972 aq->queue_n++;
2974 if (DEBUG_THREAD_SIGNAL)
2975 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2976 aq->agent->device_id, aq->id);
2977 pthread_cond_signal (&aq->queue_cond_in);
2979 pthread_mutex_unlock (&aq->mutex);
2981 return placeholderp;
2984 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2986 static void
2987 finalize_async_thread (struct goacc_asyncqueue *aq)
2989 pthread_mutex_lock (&aq->mutex);
2990 if (aq->drain_queue_stop == 2)
2992 pthread_mutex_unlock (&aq->mutex);
2993 return;
2996 aq->drain_queue_stop = 1;
2998 if (DEBUG_THREAD_SIGNAL)
2999 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3000 aq->agent->device_id, aq->id);
3001 pthread_cond_signal (&aq->queue_cond_in);
3003 while (aq->drain_queue_stop != 2)
3005 if (DEBUG_THREAD_SLEEP)
3006 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3007 " to sleep\n", aq->agent->device_id, aq->id);
3008 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3009 if (DEBUG_THREAD_SLEEP)
3010 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
3011 aq->agent->device_id, aq->id);
3014 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
3015 aq->id);
3016 pthread_mutex_unlock (&aq->mutex);
3018 int err = pthread_join (aq->thread_drain_queue, NULL);
3019 if (err != 0)
3020 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3021 aq->agent->device_id, aq->id, strerror (err));
3022 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
3025 /* Set up an async queue for OpenMP. There will be only one. The
3026 implementation simply uses an OpenACC async queue.
3027 FIXME: is this thread-safe if two threads call this function? */
3029 static void
3030 maybe_init_omp_async (struct agent_info *agent)
3032 if (!agent->omp_async_queue)
3033 agent->omp_async_queue
3034 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
3037 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3038 copies from read-only pages. */
3040 static void
3041 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3043 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3045 if (status == HSA_STATUS_SUCCESS)
3046 return;
3048 /* It appears that the copy fails if the source data is in a read-only page.
3049 We can't detect that easily, so try copying the data to a temporary buffer
3050 and doing the copy again if we got an error above. */
3052 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3053 "[%p:+%d]\n", (void *) src, (int) len);
3055 void *src_copy = malloc (len);
3056 memcpy (src_copy, src, len);
3057 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3058 free (src_copy);
3059 if (status != HSA_STATUS_SUCCESS)
3060 GOMP_PLUGIN_error ("memory copy failed");
3063 /* Copy data to or from a device. This is intended for use as an async
3064 callback event. */
3066 static void
3067 copy_data (void *data_)
3069 struct copy_data *data = (struct copy_data *)data_;
3070 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3071 data->aq->agent->device_id, data->aq->id, data->len, data->src,
3072 data->dst);
3073 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3074 free (data);
3077 /* Request an asynchronous data copy, to or from a device, on a given queue.
3078 The event will be registered as a callback. */
3080 static void
3081 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3082 size_t len)
3084 if (DEBUG_QUEUES)
3085 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3086 aq->agent->device_id, aq->id, len, src, dst);
3087 struct copy_data *data
3088 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3089 data->dst = dst;
3090 data->src = src;
3091 data->len = len;
3092 data->aq = aq;
3093 queue_push_callback (aq, copy_data, data);
3096 /* Return true if the given queue is currently empty. */
3098 static int
3099 queue_empty (struct goacc_asyncqueue *aq)
3101 pthread_mutex_lock (&aq->mutex);
3102 int res = aq->queue_n == 0 ? 1 : 0;
3103 pthread_mutex_unlock (&aq->mutex);
3105 return res;
3108 /* Wait for a given queue to become empty. This implements an OpenACC wait
3109 directive. */
3111 static void
3112 wait_queue (struct goacc_asyncqueue *aq)
3114 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3116 drain_queue_synchronous (aq);
3117 return;
3120 pthread_mutex_lock (&aq->mutex);
3122 while (aq->queue_n > 0)
3124 if (DEBUG_THREAD_SLEEP)
3125 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3126 aq->agent->device_id, aq->id);
3127 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3128 if (DEBUG_THREAD_SLEEP)
3129 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3130 aq->id);
3133 pthread_mutex_unlock (&aq->mutex);
3134 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3137 /* }}} */
3138 /* {{{ OpenACC support */
3140 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3142 static void
3143 gcn_exec (struct kernel_info *kernel,
3144 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3145 struct goacc_asyncqueue *aq)
3147 if (!GOMP_OFFLOAD_can_run (kernel))
3148 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3150 /* If we get here then this must be an OpenACC kernel. */
3151 kernel->kind = KIND_OPENACC;
3153 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3154 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3156 struct hsa_kernel_description *d
3157 = &kernel->module->image_desc->kernel_infos[i];
3158 if (d->name == kernel->name)
3160 hsa_kernel_desc = d;
3161 break;
3165 /* We may have statically-determined dimensions in
3166 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3167 invocation at runtime in dims[]. We allow static dimensions to take
3168 priority over dynamic dimensions when present (non-zero). */
3169 if (hsa_kernel_desc->oacc_dims[0] > 0)
3170 dims[0] = hsa_kernel_desc->oacc_dims[0];
3171 if (hsa_kernel_desc->oacc_dims[1] > 0)
3172 dims[1] = hsa_kernel_desc->oacc_dims[1];
3173 if (hsa_kernel_desc->oacc_dims[2] > 0)
3174 dims[2] = hsa_kernel_desc->oacc_dims[2];
3176 /* Ideally, when a dimension isn't explicitly specified, we should
3177 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3178 In practice, we tune for peak performance on BabelStream, which
3179 for OpenACC is currently 32 threads per CU. */
3180 if (dims[0] == 0 && dims[1] == 0)
3182 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3183 number. There isn't really a correct answer for this without a clue
3184 about the problem size, so let's do a reasonable number of workers
3185 and gangs. */
3187 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3188 dims[1] = 8; /* Workers. */
3190 else if (dims[0] == 0 && dims[1] > 0)
3192 /* Auto-scale the number of gangs with the requested number of workers. */
3193 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3195 else if (dims[0] > 0 && dims[1] == 0)
3197 /* Auto-scale the number of workers with the requested number of gangs. */
3198 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3199 if (dims[1] == 0)
3200 dims[1] = 1;
3201 if (dims[1] > 16)
3202 dims[1] = 16;
3205 /* The incoming dimensions are expressed in terms of gangs, workers, and
3206 vectors. The HSA dimensions are expressed in terms of "work-items",
3207 which means multiples of vector lanes.
3209 The "grid size" specifies the size of the problem space, and the
3210 "work-group size" specifies how much of that we want a single compute
3211 unit to chew on at once.
3213 The three dimensions do not really correspond to hardware, but the
3214 important thing is that the HSA runtime will launch as many
3215 work-groups as it takes to process the entire grid, and each
3216 work-group will contain as many wave-fronts as it takes to process
3217 the work-items in that group.
3219 Essentially, as long as we set the Y dimension to 64 (the number of
3220 vector lanes in hardware), and the Z group size to the maximum (16),
3221 then we will get the gangs (X) and workers (Z) launched as we expect.
3223 The reason for the apparent reversal of vector and worker dimension
3224 order is to do with the way the run-time distributes work-items across
3225 v1 and v2. */
3226 struct GOMP_kernel_launch_attributes kla =
3228 /* Grid size. */
3229 {dims[0], 64, dims[1]},
3230 /* Work-group size. */
3231 {1, 64, 16}
3234 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3235 acc_prof_info *prof_info = thr->prof_info;
3236 acc_event_info enqueue_launch_event_info;
3237 acc_api_info *api_info = thr->api_info;
3238 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3239 if (profiling_dispatch_p)
3241 prof_info->event_type = acc_ev_enqueue_launch_start;
3243 enqueue_launch_event_info.launch_event.event_type
3244 = prof_info->event_type;
3245 enqueue_launch_event_info.launch_event.valid_bytes
3246 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3247 enqueue_launch_event_info.launch_event.parent_construct
3248 = acc_construct_parallel;
3249 enqueue_launch_event_info.launch_event.implicit = 1;
3250 enqueue_launch_event_info.launch_event.tool_info = NULL;
3251 enqueue_launch_event_info.launch_event.kernel_name
3252 = (char *) kernel->name;
3253 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3254 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3255 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3257 api_info->device_api = acc_device_api_other;
3259 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3260 &enqueue_launch_event_info, api_info);
3263 if (!async)
3264 run_kernel (kernel, devaddrs, &kla, NULL, false);
3265 else
3266 queue_push_launch (aq, kernel, devaddrs, &kla);
3268 if (profiling_dispatch_p)
3270 prof_info->event_type = acc_ev_enqueue_launch_end;
3271 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3272 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3273 &enqueue_launch_event_info,
3274 api_info);
3278 /* }}} */
3279 /* {{{ Generic Plugin API */
3281 /* Return the name of the accelerator, which is "gcn". */
3283 const char *
3284 GOMP_OFFLOAD_get_name (void)
3286 return "gcn";
3289 /* Return the specific capabilities the HSA accelerator have. */
3291 unsigned int
3292 GOMP_OFFLOAD_get_caps (void)
3294 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3295 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3296 | GOMP_OFFLOAD_CAP_OPENACC_200;
3299 /* Identify as GCN accelerator. */
3302 GOMP_OFFLOAD_get_type (void)
3304 return OFFLOAD_TARGET_TYPE_GCN;
3307 /* Return the libgomp version number we're compatible with. There is
3308 no requirement for cross-version compatibility. */
3310 unsigned
3311 GOMP_OFFLOAD_version (void)
3313 return GOMP_VERSION;
3316 /* Return the number of GCN devices on the system. */
3319 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3321 if (!init_hsa_context ())
3322 return 0;
3323 /* Return -1 if no omp_requires_mask cannot be fulfilled but
3324 devices were present. */
3325 if (hsa_context.agent_count > 0
3326 && ((omp_requires_mask
3327 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3328 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3329 return -1;
3330 return hsa_context.agent_count;
3333 /* Initialize device (agent) number N so that it can be used for computation.
3334 Return TRUE on success. */
3336 bool
3337 GOMP_OFFLOAD_init_device (int n)
3339 if (!init_hsa_context ())
3340 return false;
3341 if (n >= hsa_context.agent_count)
3343 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3344 return false;
3346 struct agent_info *agent = &hsa_context.agents[n];
3348 if (agent->initialized)
3349 return true;
3351 agent->device_id = n;
3353 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3355 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3356 return false;
3358 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3360 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3361 return false;
3363 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3365 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3366 return false;
3368 if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
3370 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3371 return false;
3373 agent->async_queues = NULL;
3374 agent->omp_async_queue = NULL;
3375 agent->ephemeral_memories_list = NULL;
3377 uint32_t queue_size;
3378 hsa_status_t status;
3379 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3380 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3381 &queue_size);
3382 if (status != HSA_STATUS_SUCCESS)
3383 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3384 status);
3386 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3387 &agent->name);
3388 if (status != HSA_STATUS_SUCCESS)
3389 return hsa_error ("Error querying the name of the agent", status);
3391 agent->device_isa = isa_code (agent->name);
3392 if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
3393 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3395 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3396 &agent->vendor_name);
3397 if (status != HSA_STATUS_SUCCESS)
3398 return hsa_error ("Error querying the vendor name of the agent", status);
3400 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3401 HSA_QUEUE_TYPE_MULTI,
3402 hsa_queue_callback, NULL, UINT32_MAX,
3403 UINT32_MAX, &agent->sync_queue);
3404 if (status != HSA_STATUS_SUCCESS)
3405 return hsa_error ("Error creating command queue", status);
3407 agent->kernarg_region.handle = (uint64_t) -1;
3408 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3409 get_kernarg_memory_region,
3410 &agent->kernarg_region);
3411 if (status != HSA_STATUS_SUCCESS
3412 && status != HSA_STATUS_INFO_BREAK)
3413 hsa_error ("Scanning memory regions failed", status);
3414 if (agent->kernarg_region.handle == (uint64_t) -1)
3416 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3417 "arguments");
3418 return false;
3420 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3421 dump_hsa_region (agent->kernarg_region, NULL);
3423 agent->data_region.handle = (uint64_t) -1;
3424 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3425 get_data_memory_region,
3426 &agent->data_region);
3427 if (status != HSA_STATUS_SUCCESS
3428 && status != HSA_STATUS_INFO_BREAK)
3429 hsa_error ("Scanning memory regions failed", status);
3430 if (agent->data_region.handle == (uint64_t) -1)
3432 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3433 "data");
3434 return false;
3436 GCN_DEBUG ("Selected device data memory region:\n");
3437 dump_hsa_region (agent->data_region, NULL);
3439 GCN_DEBUG ("GCN agent %d initialized\n", n);
3441 agent->initialized = true;
3442 return true;
3445 /* Load GCN object-code module described by struct gcn_image_desc in
3446 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3447 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3448 contain the on-device addresses of the functions for reverse offload. To be
3449 freed by the caller. */
3452 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3453 struct addr_pair **target_table,
3454 uint64_t **rev_fn_table,
3455 uint64_t *host_ind_fn_table)
3457 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3459 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3460 " (expected %u, received %u)",
3461 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3462 return -1;
3465 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3466 struct agent_info *agent;
3467 struct addr_pair *pair;
3468 struct module_info *module;
3469 struct kernel_info *kernel;
3470 int kernel_count = image_desc->kernel_count;
3471 unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3472 ? image_desc->ind_func_count : 0;
3473 unsigned var_count = image_desc->global_variable_count;
3474 /* Currently, "others" is a struct of ICVS. */
3475 int other_count = 1;
3477 agent = get_agent_info (ord);
3478 if (!agent)
3479 return -1;
3481 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3483 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3484 return -1;
3486 if (agent->prog_finalized
3487 && !destroy_hsa_program (agent))
3488 return -1;
3490 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3491 GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
3492 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3493 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3494 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3495 * sizeof (struct addr_pair));
3496 *target_table = pair;
3497 module = (struct module_info *)
3498 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3499 + kernel_count * sizeof (struct kernel_info));
3500 module->image_desc = image_desc;
3501 module->kernel_count = kernel_count;
3502 module->heap = NULL;
3503 module->constructors_run_p = false;
3505 kernel = &module->kernels[0];
3507 /* Allocate memory for kernel dependencies. */
3508 for (unsigned i = 0; i < kernel_count; i++)
3510 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3511 if (!init_basic_kernel_info (kernel, d, agent, module))
3512 return -1;
3513 if (strcmp (d->name, "_init_array") == 0)
3514 module->init_array_func = kernel;
3515 else if (strcmp (d->name, "_fini_array") == 0)
3516 module->fini_array_func = kernel;
3517 else
3519 pair->start = (uintptr_t) kernel;
3520 pair->end = (uintptr_t) (kernel + 1);
3521 pair++;
3523 kernel++;
3526 agent->module = module;
3527 if (pthread_rwlock_unlock (&agent->module_rwlock))
3529 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3530 return -1;
3533 if (!create_and_finalize_hsa_program (agent))
3534 return -1;
3536 if (var_count > 0)
3538 hsa_status_t status;
3539 hsa_executable_symbol_t var_symbol;
3540 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3541 ".offload_var_table",
3542 agent->id,
3543 0, &var_symbol);
3545 if (status != HSA_STATUS_SUCCESS)
3546 hsa_fatal ("Could not find symbol for variable in the code object",
3547 status);
3549 uint64_t var_table_addr;
3550 status = hsa_fns.hsa_executable_symbol_get_info_fn
3551 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3552 &var_table_addr);
3553 if (status != HSA_STATUS_SUCCESS)
3554 hsa_fatal ("Could not extract a variable from its symbol", status);
3556 struct {
3557 uint64_t addr;
3558 uint64_t size;
3559 } var_table[var_count];
3560 GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3561 (void*)var_table_addr, sizeof (var_table));
3563 for (unsigned i = 0; i < var_count; i++)
3565 pair->start = var_table[i].addr;
3566 pair->end = var_table[i].addr + var_table[i].size;
3567 GCN_DEBUG ("Found variable at %p with size %lu\n",
3568 (void *)var_table[i].addr, var_table[i].size);
3569 pair++;
3573 if (ind_func_count > 0)
3575 hsa_status_t status;
3577 /* Read indirect function table from image. */
3578 hsa_executable_symbol_t ind_funcs_symbol;
3579 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3580 ".offload_ind_func_table",
3581 agent->id,
3582 0, &ind_funcs_symbol);
3584 if (status != HSA_STATUS_SUCCESS)
3585 hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3586 "code object", status);
3588 uint64_t ind_funcs_table_addr;
3589 status = hsa_fns.hsa_executable_symbol_get_info_fn
3590 (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3591 &ind_funcs_table_addr);
3592 if (status != HSA_STATUS_SUCCESS)
3593 hsa_fatal ("Could not extract a variable from its symbol", status);
3595 uint64_t ind_funcs_table[ind_func_count];
3596 GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
3597 (void*) ind_funcs_table_addr,
3598 sizeof (ind_funcs_table));
3600 /* Build host->target address map for indirect functions. */
3601 uint64_t ind_fn_map[ind_func_count * 2 + 1];
3602 for (unsigned i = 0; i < ind_func_count; i++)
3604 ind_fn_map[i * 2] = host_ind_fn_table[i];
3605 ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
3606 GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3607 i, host_ind_fn_table[i], ind_funcs_table[i]);
3609 ind_fn_map[ind_func_count * 2] = 0;
3611 /* Write the map onto the target. */
3612 void *map_target_addr
3613 = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
3614 GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
3616 GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
3617 (void*) ind_fn_map,
3618 sizeof (ind_fn_map));
3620 /* Write address of the map onto the target. */
3621 hsa_executable_symbol_t symbol;
3623 status
3624 = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3625 XSTRING (GOMP_INDIRECT_ADDR_MAP),
3626 agent->id, 0, &symbol);
3627 if (status != HSA_STATUS_SUCCESS)
3628 hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3629 status);
3631 uint64_t varptr;
3632 uint32_t varsize;
3634 status = hsa_fns.hsa_executable_symbol_get_info_fn
3635 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3636 &varptr);
3637 if (status != HSA_STATUS_SUCCESS)
3638 hsa_fatal ("Could not extract a variable from its symbol", status);
3639 status = hsa_fns.hsa_executable_symbol_get_info_fn
3640 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3641 &varsize);
3642 if (status != HSA_STATUS_SUCCESS)
3643 hsa_fatal ("Could not extract a variable size from its symbol",
3644 status);
3646 GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3647 varptr, varsize);
3649 GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
3650 &map_target_addr,
3651 sizeof (map_target_addr));
3654 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3656 hsa_status_t status;
3657 hsa_executable_symbol_t var_symbol;
3658 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3659 XSTRING (GOMP_ADDITIONAL_ICVS),
3660 agent->id, 0, &var_symbol);
3661 if (status == HSA_STATUS_SUCCESS)
3663 uint64_t varptr;
3664 uint32_t varsize;
3666 status = hsa_fns.hsa_executable_symbol_get_info_fn
3667 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3668 &varptr);
3669 if (status != HSA_STATUS_SUCCESS)
3670 hsa_fatal ("Could not extract a variable from its symbol", status);
3671 status = hsa_fns.hsa_executable_symbol_get_info_fn
3672 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3673 &varsize);
3674 if (status != HSA_STATUS_SUCCESS)
3675 hsa_fatal ("Could not extract a variable size from its symbol",
3676 status);
3678 pair->start = varptr;
3679 pair->end = varptr + varsize;
3681 else
3683 /* The variable was not in this image. */
3684 GCN_DEBUG ("Variable not found in image: %s\n",
3685 XSTRING (GOMP_ADDITIONAL_ICVS));
3686 pair->start = pair->end = 0;
3689 /* Ensure that constructors are run first. */
3690 struct GOMP_kernel_launch_attributes kla =
3691 { 3,
3692 /* Grid size. */
3693 { 1, 64, 1 },
3694 /* Work-group size. */
3695 { 1, 64, 1 }
3698 if (module->init_array_func)
3700 init_kernel (module->init_array_func);
3701 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3703 module->constructors_run_p = true;
3705 /* Don't report kernels that libgomp need not know about. */
3706 if (module->init_array_func)
3707 kernel_count--;
3708 if (module->fini_array_func)
3709 kernel_count--;
3711 if (rev_fn_table != NULL && kernel_count == 0)
3712 *rev_fn_table = NULL;
3713 else if (rev_fn_table != NULL)
3715 hsa_status_t status;
3716 hsa_executable_symbol_t var_symbol;
3717 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3718 ".offload_func_table",
3719 agent->id, 0, &var_symbol);
3720 if (status != HSA_STATUS_SUCCESS)
3721 hsa_fatal ("Could not find symbol for variable in the code object",
3722 status);
3723 uint64_t fn_table_addr;
3724 status = hsa_fns.hsa_executable_symbol_get_info_fn
3725 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3726 &fn_table_addr);
3727 if (status != HSA_STATUS_SUCCESS)
3728 hsa_fatal ("Could not extract a variable from its symbol", status);
3729 *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3730 GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3731 (void*) fn_table_addr,
3732 kernel_count * sizeof (uint64_t));
3735 return kernel_count + var_count + other_count;
3738 /* Unload GCN object-code module described by struct gcn_image_desc in
3739 TARGET_DATA from agent number N. Return TRUE on success. */
3741 bool
3742 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3744 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3746 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3747 " (expected %u, received %u)",
3748 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3749 return false;
3752 struct agent_info *agent;
3753 agent = get_agent_info (n);
3754 if (!agent)
3755 return false;
3757 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3759 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3760 return false;
3763 if (!agent->module || agent->module->image_desc != target_data)
3765 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3766 "loaded before");
3767 return false;
3770 if (!destroy_module (agent->module, true))
3771 return false;
3772 free (agent->module);
3773 agent->module = NULL;
3774 if (!destroy_hsa_program (agent))
3775 return false;
3776 if (pthread_rwlock_unlock (&agent->module_rwlock))
3778 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3779 return false;
3781 return true;
3784 /* Deinitialize all information and status associated with agent number N. We
3785 do not attempt any synchronization, assuming the user and libgomp will not
3786 attempt deinitialization of a device that is in any way being used at the
3787 same time. Return TRUE on success. */
3789 bool
3790 GOMP_OFFLOAD_fini_device (int n)
3792 struct agent_info *agent = get_agent_info (n);
3793 if (!agent)
3794 return false;
3796 if (!agent->initialized)
3797 return true;
3799 if (agent->omp_async_queue)
3801 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3802 agent->omp_async_queue = NULL;
3805 if (agent->module)
3807 if (!destroy_module (agent->module, false))
3808 return false;
3809 free (agent->module);
3810 agent->module = NULL;
3813 if (!destroy_ephemeral_memories (agent))
3814 return false;
3816 if (!destroy_hsa_program (agent))
3817 return false;
3819 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3820 if (status != HSA_STATUS_SUCCESS)
3821 return hsa_error ("Error destroying command queue", status);
3823 if (pthread_mutex_destroy (&agent->prog_mutex))
3825 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3826 return false;
3828 if (pthread_rwlock_destroy (&agent->module_rwlock))
3830 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3831 return false;
3834 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3836 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3837 return false;
3839 if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
3841 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3842 return false;
3844 agent->initialized = false;
3845 return true;
3848 /* Return true if the HSA runtime can run function FN_PTR. */
3850 bool
3851 GOMP_OFFLOAD_can_run (void *fn_ptr)
3853 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3855 init_kernel (kernel);
3856 if (kernel->initialization_failed)
3857 goto failure;
3859 return true;
3861 failure:
3862 if (suppress_host_fallback)
3863 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3864 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3865 return false;
3868 /* Allocate memory on device N. */
3870 void *
3871 GOMP_OFFLOAD_alloc (int n, size_t size)
3873 struct agent_info *agent = get_agent_info (n);
3874 return alloc_by_agent (agent, size);
3877 /* Free memory from device N. */
3879 bool
3880 GOMP_OFFLOAD_free (int device, void *ptr)
3882 GCN_DEBUG ("Freeing memory on device %d\n", device);
3884 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3885 if (status != HSA_STATUS_SUCCESS)
3887 hsa_error ("Could not free device memory", status);
3888 return false;
3891 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3892 bool profiling_dispatch_p
3893 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3894 if (profiling_dispatch_p)
3896 acc_prof_info *prof_info = thr->prof_info;
3897 acc_event_info data_event_info;
3898 acc_api_info *api_info = thr->api_info;
3900 prof_info->event_type = acc_ev_free;
3902 data_event_info.data_event.event_type = prof_info->event_type;
3903 data_event_info.data_event.valid_bytes
3904 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3905 data_event_info.data_event.parent_construct
3906 = acc_construct_parallel;
3907 data_event_info.data_event.implicit = 1;
3908 data_event_info.data_event.tool_info = NULL;
3909 data_event_info.data_event.var_name = NULL;
3910 data_event_info.data_event.bytes = 0;
3911 data_event_info.data_event.host_ptr = NULL;
3912 data_event_info.data_event.device_ptr = (void *) ptr;
3914 api_info->device_api = acc_device_api_other;
3916 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3917 api_info);
3920 return true;
3923 /* Copy data from DEVICE to host. */
3925 bool
3926 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3928 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3929 src, dst);
3930 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3931 if (status != HSA_STATUS_SUCCESS)
3932 GOMP_PLUGIN_error ("memory copy failed");
3933 return true;
3936 /* Copy data from host to DEVICE. */
3938 bool
3939 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3941 GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3942 device, dst);
3943 hsa_memory_copy_wrapper (dst, src, n);
3944 return true;
3947 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3949 bool
3950 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3952 struct gcn_thread *thread_data = gcn_thread ();
3954 if (thread_data && !async_synchronous_p (thread_data->async))
3956 struct agent_info *agent = get_agent_info (device);
3957 maybe_init_omp_async (agent);
3958 queue_push_copy (agent->omp_async_queue, dst, src, n);
3959 return true;
3962 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3963 device, src, device, dst);
3964 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3965 if (status != HSA_STATUS_SUCCESS)
3966 GOMP_PLUGIN_error ("memory copy failed");
3967 return true;
3970 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
3971 measured in bytes. So we have:
3973 dim1_size: number of bytes to copy on innermost dimension ("row")
3974 dim0_len: number of rows to copy
3975 dst: base pointer for destination of copy
3976 dst_offset1_size: innermost row offset (for dest), in bytes
3977 dst_offset0_len: offset, number of rows (for dest)
3978 dst_dim1_size: whole-array dest row length, in bytes (pitch)
3979 src: base pointer for source of copy
3980 src_offset1_size: innermost row offset (for source), in bytes
3981 src_offset0_len: offset, number of rows (for source)
3982 src_dim1_size: whole-array source row length, in bytes (pitch)
3986 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
3987 size_t dim0_len, void *dst, size_t dst_offset1_size,
3988 size_t dst_offset0_len, size_t dst_dim1_size,
3989 const void *src, size_t src_offset1_size,
3990 size_t src_offset0_len, size_t src_dim1_size)
3992 if (!hsa_fns.hsa_amd_memory_lock_fn
3993 || !hsa_fns.hsa_amd_memory_unlock_fn
3994 || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
3995 return -1;
3997 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
3998 out quietly if we have anything oddly-aligned rather than letting the
3999 driver raise an error. */
4000 if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
4001 return -1;
4003 if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
4004 return -1;
4006 /* Only handle host to device or device to host transfers here. */
4007 if ((dst_ord == -1 && src_ord == -1)
4008 || (dst_ord != -1 && src_ord != -1))
4009 return -1;
4011 hsa_amd_copy_direction_t dir
4012 = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
4013 hsa_agent_t copy_agent;
4015 /* We need to pin (lock) host memory before we start the transfer. Try to
4016 lock the minimum size necessary, i.e. using partial first/last rows of the
4017 whole array. Something like this:
4019 rows -->
4020 ..............
4021 c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
4022 o | ++#######+++++ <- whole row
4023 l | ++#######+++++ <- "
4024 s v ++#######..... <- last row apart from trailing remainder
4025 ..............
4027 We could split very large transfers into several rectangular copies, but
4028 that is unimplemented for now. */
4030 size_t bounded_size_host, first_elem_offset_host;
4031 void *host_ptr;
4032 if (dir == hsaHostToDevice)
4034 bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
4035 first_elem_offset_host = src_offset0_len * src_dim1_size
4036 + src_offset1_size;
4037 host_ptr = (void *) src;
4038 struct agent_info *agent = get_agent_info (dst_ord);
4039 copy_agent = agent->id;
4041 else
4043 bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
4044 first_elem_offset_host = dst_offset0_len * dst_dim1_size
4045 + dst_offset1_size;
4046 host_ptr = dst;
4047 struct agent_info *agent = get_agent_info (src_ord);
4048 copy_agent = agent->id;
4051 void *agent_ptr;
4053 hsa_status_t status
4054 = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4055 bounded_size_host, NULL, 0, &agent_ptr);
4056 /* We can't lock the host memory: don't give up though, we might still be
4057 able to use the slow path in our caller. So, don't make this an
4058 error. */
4059 if (status != HSA_STATUS_SUCCESS)
4060 return -1;
4062 hsa_pitched_ptr_t dstpp, srcpp;
4063 hsa_dim3_t dst_offsets, src_offsets, ranges;
4065 int retval = 1;
4067 hsa_signal_t completion_signal;
4068 status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4069 if (status != HSA_STATUS_SUCCESS)
4071 retval = -1;
4072 goto unlock;
4075 if (dir == hsaHostToDevice)
4077 srcpp.base = agent_ptr - first_elem_offset_host;
4078 dstpp.base = dst;
4080 else
4082 srcpp.base = (void *) src;
4083 dstpp.base = agent_ptr - first_elem_offset_host;
4086 srcpp.pitch = src_dim1_size;
4087 srcpp.slice = 0;
4089 src_offsets.x = src_offset1_size;
4090 src_offsets.y = src_offset0_len;
4091 src_offsets.z = 0;
4093 dstpp.pitch = dst_dim1_size;
4094 dstpp.slice = 0;
4096 dst_offsets.x = dst_offset1_size;
4097 dst_offsets.y = dst_offset0_len;
4098 dst_offsets.z = 0;
4100 ranges.x = dim1_size;
4101 ranges.y = dim0_len;
4102 ranges.z = 1;
4104 status
4105 = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4106 &src_offsets, &ranges,
4107 copy_agent, dir, 0, NULL,
4108 completion_signal);
4109 /* If the rectangular copy fails, we might still be able to use the slow
4110 path. We need to unlock the host memory though, so don't return
4111 immediately. */
4112 if (status != HSA_STATUS_SUCCESS)
4113 retval = -1;
4114 else
4115 hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4116 HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
4117 HSA_WAIT_STATE_ACTIVE);
4119 hsa_fns.hsa_signal_destroy_fn (completion_signal);
4121 unlock:
4122 status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4123 if (status != HSA_STATUS_SUCCESS)
4124 hsa_fatal ("Could not unlock host memory", status);
4126 return retval;
4129 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4130 measured in bytes. So we have:
4132 dim2_size: number of bytes to copy on innermost dimension ("row")
4133 dim1_len: number of rows per slice to copy
4134 dim0_len: number of slices to copy
4135 dst: base pointer for destination of copy
4136 dst_offset2_size: innermost row offset (for dest), in bytes
4137 dst_offset1_len: offset, number of rows (for dest)
4138 dst_offset0_len: offset, number of slices (for dest)
4139 dst_dim2_size: whole-array dest row length, in bytes (pitch)
4140 dst_dim1_len: whole-array number of rows in slice (for dest)
4141 src: base pointer for source of copy
4142 src_offset2_size: innermost row offset (for source), in bytes
4143 src_offset1_len: offset, number of rows (for source)
4144 src_offset0_len: offset, number of slices (for source)
4145 src_dim2_size: whole-array source row length, in bytes (pitch)
4146 src_dim1_len: whole-array number of rows in slice (for source)
4150 GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
4151 size_t dim1_len, size_t dim0_len, void *dst,
4152 size_t dst_offset2_size, size_t dst_offset1_len,
4153 size_t dst_offset0_len, size_t dst_dim2_size,
4154 size_t dst_dim1_len, const void *src,
4155 size_t src_offset2_size, size_t src_offset1_len,
4156 size_t src_offset0_len, size_t src_dim2_size,
4157 size_t src_dim1_len)
4159 if (!hsa_fns.hsa_amd_memory_lock_fn
4160 || !hsa_fns.hsa_amd_memory_unlock_fn
4161 || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
4162 return -1;
4164 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
4165 out quietly if we have anything oddly-aligned rather than letting the
4166 driver raise an error. */
4167 if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
4168 return -1;
4170 if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
4171 return -1;
4173 /* Only handle host to device or device to host transfers here. */
4174 if ((dst_ord == -1 && src_ord == -1)
4175 || (dst_ord != -1 && src_ord != -1))
4176 return -1;
4178 hsa_amd_copy_direction_t dir
4179 = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
4180 hsa_agent_t copy_agent;
4182 /* We need to pin (lock) host memory before we start the transfer. Try to
4183 lock the minimum size necessary, i.e. using partial first/last slices of
4184 the whole 3D array. Something like this:
4186 slice 0: slice 1: slice 2:
4187 __________ __________ __________
4188 ^ /+++++++++/ : /+++++++++/ : / /
4189 column /+++##++++/| | /+++##++++/| | /+++## / # = subarray
4190 / / ##++++/ | |/+++##++++/ | |/+++##++++/ + = area to pin
4191 /_________/ : /_________/ : /_________/
4192 row --->
4194 We could split very large transfers into several rectangular copies, but
4195 that is unimplemented for now. */
4197 size_t bounded_size_host, first_elem_offset_host;
4198 void *host_ptr;
4199 if (dir == hsaHostToDevice)
4201 size_t slice_bytes = src_dim2_size * src_dim1_len;
4202 bounded_size_host = slice_bytes * (dim0_len - 1)
4203 + src_dim2_size * (dim1_len - 1)
4204 + dim2_size;
4205 first_elem_offset_host = src_offset0_len * slice_bytes
4206 + src_offset1_len * src_dim2_size
4207 + src_offset2_size;
4208 host_ptr = (void *) src;
4209 struct agent_info *agent = get_agent_info (dst_ord);
4210 copy_agent = agent->id;
4212 else
4214 size_t slice_bytes = dst_dim2_size * dst_dim1_len;
4215 bounded_size_host = slice_bytes * (dim0_len - 1)
4216 + dst_dim2_size * (dim1_len - 1)
4217 + dim2_size;
4218 first_elem_offset_host = dst_offset0_len * slice_bytes
4219 + dst_offset1_len * dst_dim2_size
4220 + dst_offset2_size;
4221 host_ptr = dst;
4222 struct agent_info *agent = get_agent_info (src_ord);
4223 copy_agent = agent->id;
4226 void *agent_ptr;
4228 hsa_status_t status
4229 = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4230 bounded_size_host, NULL, 0, &agent_ptr);
4231 /* We can't lock the host memory: don't give up though, we might still be
4232 able to use the slow path in our caller (maybe even with iterated memcpy2d
4233 calls). So, don't make this an error. */
4234 if (status != HSA_STATUS_SUCCESS)
4235 return -1;
4237 hsa_pitched_ptr_t dstpp, srcpp;
4238 hsa_dim3_t dst_offsets, src_offsets, ranges;
4240 int retval = 1;
4242 hsa_signal_t completion_signal;
4243 status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4244 if (status != HSA_STATUS_SUCCESS)
4246 retval = -1;
4247 goto unlock;
4250 if (dir == hsaHostToDevice)
4252 srcpp.base = agent_ptr - first_elem_offset_host;
4253 dstpp.base = dst;
4255 else
4257 srcpp.base = (void *) src;
4258 dstpp.base = agent_ptr - first_elem_offset_host;
4261 /* Pitch is measured in bytes. */
4262 srcpp.pitch = src_dim2_size;
4263 /* Slice is also measured in bytes (i.e. total per-slice). */
4264 srcpp.slice = src_dim2_size * src_dim1_len;
4266 src_offsets.x = src_offset2_size;
4267 src_offsets.y = src_offset1_len;
4268 src_offsets.z = src_offset0_len;
4270 /* As above. */
4271 dstpp.pitch = dst_dim2_size;
4272 dstpp.slice = dst_dim2_size * dst_dim1_len;
4274 dst_offsets.x = dst_offset2_size;
4275 dst_offsets.y = dst_offset1_len;
4276 dst_offsets.z = dst_offset0_len;
4278 ranges.x = dim2_size;
4279 ranges.y = dim1_len;
4280 ranges.z = dim0_len;
4282 status
4283 = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4284 &src_offsets, &ranges,
4285 copy_agent, dir, 0, NULL,
4286 completion_signal);
4287 /* If the rectangular copy fails, we might still be able to use the slow
4288 path. We need to unlock the host memory though, so don't return
4289 immediately. */
4290 if (status != HSA_STATUS_SUCCESS)
4291 retval = -1;
4292 else
4294 hsa_signal_value_t sv
4295 = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4296 HSA_SIGNAL_CONDITION_LT, 1,
4297 UINT64_MAX,
4298 HSA_WAIT_STATE_ACTIVE);
4299 if (sv < 0)
4301 GCN_WARNING ("async copy rect failure");
4302 retval = -1;
4306 hsa_fns.hsa_signal_destroy_fn (completion_signal);
4308 unlock:
4309 status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4310 if (status != HSA_STATUS_SUCCESS)
4311 hsa_fatal ("Could not unlock host memory", status);
4313 return retval;
4316 /* }}} */
4317 /* {{{ OpenMP Plugin API */
4319 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4320 in VARS as a parameter. The kernel is identified by FN_PTR which must point
4321 to a kernel_info structure, and must have previously been loaded to the
4322 specified device. */
4324 void
4325 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
4327 struct agent_info *agent = get_agent_info (device);
4328 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4329 struct GOMP_kernel_launch_attributes def;
4330 struct GOMP_kernel_launch_attributes *kla;
4331 assert (agent == kernel->agent);
4333 /* If we get here then the kernel must be OpenMP. */
4334 kernel->kind = KIND_OPENMP;
4336 if (!parse_target_attributes (args, &def, &kla, agent))
4338 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4339 return;
4341 run_kernel (kernel, vars, kla, NULL, false);
4344 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
4345 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4346 GOMP_PLUGIN_target_task_completion when it has finished. */
4348 void
4349 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
4350 void **args, void *async_data)
4352 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4353 struct agent_info *agent = get_agent_info (device);
4354 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
4355 struct GOMP_kernel_launch_attributes def;
4356 struct GOMP_kernel_launch_attributes *kla;
4357 assert (agent == kernel->agent);
4359 /* If we get here then the kernel must be OpenMP. */
4360 kernel->kind = KIND_OPENMP;
4362 if (!parse_target_attributes (args, &def, &kla, agent))
4364 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4365 return;
4368 maybe_init_omp_async (agent);
4369 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
4370 queue_push_callback (agent->omp_async_queue,
4371 GOMP_PLUGIN_target_task_completion, async_data);
4374 /* }}} */
4375 /* {{{ OpenACC Plugin API */
4377 /* Run a synchronous OpenACC kernel. The device number is inferred from the
4378 already-loaded KERNEL. */
4380 void
4381 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
4382 size_t mapnum __attribute__((unused)),
4383 void **hostaddrs __attribute__((unused)),
4384 void **devaddrs, unsigned *dims,
4385 void *targ_mem_desc)
4387 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4389 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
4392 /* Run an asynchronous OpenACC kernel on the specified queue. */
4394 void
4395 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
4396 size_t mapnum __attribute__((unused)),
4397 void **hostaddrs __attribute__((unused)),
4398 void **devaddrs,
4399 unsigned *dims, void *targ_mem_desc,
4400 struct goacc_asyncqueue *aq)
4402 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4404 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
4407 /* Create a new asynchronous thread and queue for running future kernels. */
4409 struct goacc_asyncqueue *
4410 GOMP_OFFLOAD_openacc_async_construct (int device)
4412 struct agent_info *agent = get_agent_info (device);
4414 pthread_mutex_lock (&agent->async_queues_mutex);
4416 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
4417 aq->agent = get_agent_info (device);
4418 aq->prev = NULL;
4419 aq->next = agent->async_queues;
4420 if (aq->next)
4422 aq->next->prev = aq;
4423 aq->id = aq->next->id + 1;
4425 else
4426 aq->id = 1;
4427 agent->async_queues = aq;
4429 aq->queue_first = 0;
4430 aq->queue_n = 0;
4431 aq->drain_queue_stop = 0;
4433 if (pthread_mutex_init (&aq->mutex, NULL))
4435 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
4436 return false;
4438 if (pthread_cond_init (&aq->queue_cond_in, NULL))
4440 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4441 return false;
4443 if (pthread_cond_init (&aq->queue_cond_out, NULL))
4445 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4446 return false;
4449 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4450 ASYNC_QUEUE_SIZE,
4451 HSA_QUEUE_TYPE_MULTI,
4452 hsa_queue_callback, NULL,
4453 UINT32_MAX, UINT32_MAX,
4454 &aq->hsa_queue);
4455 if (status != HSA_STATUS_SUCCESS)
4456 hsa_fatal ("Error creating command queue", status);
4458 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4459 if (err != 0)
4460 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4461 strerror (err));
4462 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4463 aq->id);
4465 pthread_mutex_unlock (&agent->async_queues_mutex);
4467 return aq;
4470 /* Destroy an existing asynchronous thread and queue. Waits for any
4471 currently-running task to complete, but cancels any queued tasks. */
4473 bool
4474 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4476 struct agent_info *agent = aq->agent;
4478 finalize_async_thread (aq);
4480 pthread_mutex_lock (&agent->async_queues_mutex);
4482 int err;
4483 if ((err = pthread_mutex_destroy (&aq->mutex)))
4485 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4486 goto fail;
4488 if (pthread_cond_destroy (&aq->queue_cond_in))
4490 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4491 goto fail;
4493 if (pthread_cond_destroy (&aq->queue_cond_out))
4495 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4496 goto fail;
4498 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4499 if (status != HSA_STATUS_SUCCESS)
4501 hsa_error ("Error destroying command queue", status);
4502 goto fail;
4505 if (aq->prev)
4506 aq->prev->next = aq->next;
4507 if (aq->next)
4508 aq->next->prev = aq->prev;
4509 if (agent->async_queues == aq)
4510 agent->async_queues = aq->next;
4512 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4514 free (aq);
4515 pthread_mutex_unlock (&agent->async_queues_mutex);
4516 return true;
4518 fail:
4519 pthread_mutex_unlock (&agent->async_queues_mutex);
4520 return false;
4523 /* Return true if the specified async queue is currently empty. */
4526 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4528 return queue_empty (aq);
4531 /* Block until the specified queue has executed all its tasks and the
4532 queue is empty. */
4534 bool
4535 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4537 wait_queue (aq);
4538 return true;
4541 /* Add a serialization point across two async queues. Any new tasks added to
4542 AQ2, after this call, will not run until all tasks on AQ1, at the time
4543 of this call, have completed. */
4545 bool
4546 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4547 struct goacc_asyncqueue *aq2)
4549 /* For serialize, stream aq2 waits for aq1 to complete work that has been
4550 scheduled to run on it up to this point. */
4551 if (aq1 != aq2)
4553 struct placeholder *placeholderp = queue_push_placeholder (aq1);
4554 queue_push_asyncwait (aq2, placeholderp);
4556 return true;
4559 /* Add an opaque callback to the given async queue. */
4561 void
4562 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4563 void (*fn) (void *), void *data)
4565 queue_push_callback (aq, fn, data);
4568 /* Queue up an asynchronous data copy from host to DEVICE. */
4570 bool
4571 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4572 size_t n, struct goacc_asyncqueue *aq)
4574 struct agent_info *agent = get_agent_info (device);
4575 assert (agent == aq->agent);
4576 queue_push_copy (aq, dst, src, n);
4577 return true;
4580 /* Queue up an asynchronous data copy from DEVICE to host. */
4582 bool
4583 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4584 size_t n, struct goacc_asyncqueue *aq)
4586 struct agent_info *agent = get_agent_info (device);
4587 assert (agent == aq->agent);
4588 queue_push_copy (aq, dst, src, n);
4589 return true;
4592 union goacc_property_value
4593 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4595 struct agent_info *agent = get_agent_info (device);
4597 union goacc_property_value propval = { .val = 0 };
4599 switch (prop)
4601 case GOACC_PROPERTY_FREE_MEMORY:
4602 /* Not supported. */
4603 break;
4604 case GOACC_PROPERTY_MEMORY:
4606 size_t size;
4607 hsa_region_t region = agent->data_region;
4608 hsa_status_t status =
4609 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4610 if (status == HSA_STATUS_SUCCESS)
4611 propval.val = size;
4612 break;
4614 case GOACC_PROPERTY_NAME:
4615 propval.ptr = agent->name;
4616 break;
4617 case GOACC_PROPERTY_VENDOR:
4618 propval.ptr = agent->vendor_name;
4619 break;
4620 case GOACC_PROPERTY_DRIVER:
4621 propval.ptr = hsa_context.driver_version_s;
4622 break;
4625 return propval;
4628 /* Set up plugin-specific thread-local-data (host-side). */
4630 void *
4631 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4633 struct gcn_thread *thread_data
4634 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4636 thread_data->async = GOMP_ASYNC_SYNC;
4638 return (void *) thread_data;
4641 /* Clean up plugin-specific thread-local-data. */
4643 void
4644 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4646 free (data);
4649 /* }}} */