1 /* Run a stand-alone AMD GCN kernel.
3 Copyright 2017 Mentor Graphics Corporation
4 Copyright 2018-2019 Free Software Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
21 The kernel entry point's signature must use a standard main signature:
23 int main(int argc, char **argv)
37 /* These probably won't be in elf.h for a while. */
39 #define R_AMDGPU_NONE 0
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
42 #define R_AMDGPU_ABS64 3 /* S + A */
43 #define R_AMDGPU_REL32 4 /* S + A - P */
44 #define R_AMDGPU_REL64 5 /* S + A - P */
45 #define R_AMDGPU_ABS32 6 /* S + A */
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
67 hsa_agent_t device
= { 0 };
68 hsa_queue_t
*queue
= NULL
;
70 hsa_executable_t executable
= { 0 };
72 hsa_region_t kernargs_region
= { 0 };
73 uint32_t kernarg_segment_size
= 0;
74 uint32_t group_segment_size
= 0;
75 uint32_t private_segment_size
= 0;
78 usage (const char *progname
)
80 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
84 " --debug\n", progname
);
88 version (const char *progname
)
90 printf ("%s " VERSION_STRING
"\n", progname
);
93 /* As an HSA runtime is dlopened, following structure defines the necessary
95 Code adapted from libgomp. */
97 struct hsa_runtime_fn_info
100 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
101 const char **status_string
);
102 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
103 hsa_agent_info_t attribute
,
105 hsa_status_t (*hsa_init_fn
) (void);
106 hsa_status_t (*hsa_iterate_agents_fn
)
107 (hsa_status_t (*callback
) (hsa_agent_t agent
, void *data
), void *data
);
108 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
109 hsa_region_info_t attribute
,
111 hsa_status_t (*hsa_queue_create_fn
)
112 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
113 void (*callback
) (hsa_status_t status
, hsa_queue_t
*source
, void *data
),
114 void *data
, uint32_t private_segment_size
,
115 uint32_t group_segment_size
, hsa_queue_t
**queue
);
116 hsa_status_t (*hsa_agent_iterate_regions_fn
)
118 hsa_status_t (*callback
) (hsa_region_t region
, void *data
), void *data
);
119 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
120 hsa_status_t (*hsa_executable_create_fn
)
121 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
122 const char *options
, hsa_executable_t
*executable
);
123 hsa_status_t (*hsa_executable_global_variable_define_fn
)
124 (hsa_executable_t executable
, const char *variable_name
, void *address
);
125 hsa_status_t (*hsa_executable_load_code_object_fn
)
126 (hsa_executable_t executable
, hsa_agent_t agent
,
127 hsa_code_object_t code_object
, const char *options
);
128 hsa_status_t (*hsa_executable_freeze_fn
) (hsa_executable_t executable
,
129 const char *options
);
130 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
131 uint32_t num_consumers
,
132 const hsa_agent_t
*consumers
,
133 hsa_signal_t
*signal
);
134 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
136 hsa_status_t (*hsa_memory_copy_fn
) (void *dst
, const void *src
,
138 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
139 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
140 hsa_status_t (*hsa_executable_get_symbol_fn
)
141 (hsa_executable_t executable
, const char *module_name
,
142 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
143 hsa_executable_symbol_t
*symbol
);
144 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
145 (hsa_executable_symbol_t executable_symbol
,
146 hsa_executable_symbol_info_t attribute
, void *value
);
147 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
148 hsa_signal_value_t value
);
149 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
150 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
151 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
152 hsa_wait_state_t wait_state_hint
);
153 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn
)
154 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
155 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
156 hsa_wait_state_t wait_state_hint
);
157 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
158 hsa_status_t (*hsa_code_object_deserialize_fn
)
159 (void *serialized_code_object
, size_t serialized_code_object_size
,
160 const char *options
, hsa_code_object_t
*code_object
);
161 uint64_t (*hsa_queue_load_write_index_relaxed_fn
)
162 (const hsa_queue_t
*queue
);
163 void (*hsa_queue_store_write_index_relaxed_fn
)
164 (const hsa_queue_t
*queue
, uint64_t value
);
165 hsa_status_t (*hsa_shut_down_fn
) ();
168 /* HSA runtime functions that are initialized in init_hsa_context.
169 Code adapted from libgomp. */
171 static struct hsa_runtime_fn_info hsa_fns
;
173 #define DLSYM_FN(function) \
174 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
175 if (hsa_fns.function##_fn == NULL) \
179 init_hsa_runtime_functions (void)
181 void *handle
= dlopen (HSA_RUNTIME_LIB
, RTLD_LAZY
);
185 "The HSA runtime is required to run GCN kernels on hardware.\n"
186 "%s: File not found or could not be opened\n",
191 DLSYM_FN (hsa_status_string
)
192 DLSYM_FN (hsa_agent_get_info
)
194 DLSYM_FN (hsa_iterate_agents
)
195 DLSYM_FN (hsa_region_get_info
)
196 DLSYM_FN (hsa_queue_create
)
197 DLSYM_FN (hsa_agent_iterate_regions
)
198 DLSYM_FN (hsa_executable_destroy
)
199 DLSYM_FN (hsa_executable_create
)
200 DLSYM_FN (hsa_executable_global_variable_define
)
201 DLSYM_FN (hsa_executable_load_code_object
)
202 DLSYM_FN (hsa_executable_freeze
)
203 DLSYM_FN (hsa_signal_create
)
204 DLSYM_FN (hsa_memory_allocate
)
205 DLSYM_FN (hsa_memory_copy
)
206 DLSYM_FN (hsa_memory_free
)
207 DLSYM_FN (hsa_signal_destroy
)
208 DLSYM_FN (hsa_executable_get_symbol
)
209 DLSYM_FN (hsa_executable_symbol_get_info
)
210 DLSYM_FN (hsa_signal_wait_acquire
)
211 DLSYM_FN (hsa_signal_wait_relaxed
)
212 DLSYM_FN (hsa_signal_store_relaxed
)
213 DLSYM_FN (hsa_queue_destroy
)
214 DLSYM_FN (hsa_code_object_deserialize
)
215 DLSYM_FN (hsa_queue_load_write_index_relaxed
)
216 DLSYM_FN (hsa_queue_store_write_index_relaxed
)
217 DLSYM_FN (hsa_shut_down
)
222 fprintf (stderr
, "Failed to find HSA functions in " HSA_RUNTIME_LIB
"\n");
228 /* Report a fatal error STR together with the HSA error corresponding to
229 STATUS and terminate execution of the current process. */
232 hsa_fatal (const char *str
, hsa_status_t status
)
234 const char *hsa_error_msg
;
235 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
236 fprintf (stderr
, "%s: FAILED\nHSA Runtime message: %s\n", str
,
241 /* Helper macros to ensure we check the return values from the HSA Runtime.
242 These just keep the rest of the code a bit cleaner. */
244 #define XHSA_CMP(FN, CMP, MSG) \
246 hsa_status_t status = (FN); \
248 hsa_fatal ((MSG), status); \
250 fprintf (stderr, "%s: OK\n", (MSG)); \
252 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
254 /* Callback of hsa_iterate_agents.
255 Called once for each available device, and returns "break" when a
256 suitable one has been found. */
259 get_gpu_agent (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
261 hsa_device_type_t device_type
;
262 XHSA (hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
266 /* Select only GPU devices. */
267 /* TODO: support selecting from multiple GPUs. */
268 if (HSA_DEVICE_TYPE_GPU
== device_type
)
271 return HSA_STATUS_INFO_BREAK
;
274 /* The device was not suitable. */
275 return HSA_STATUS_SUCCESS
;
278 /* Callback of hsa_iterate_regions.
279 Called once for each available memory region, and returns "break" when a
280 suitable one has been found. */
283 get_kernarg_region (hsa_region_t region
, void *data
__attribute__ ((unused
)))
285 /* Reject non-global regions. */
286 hsa_region_segment_t segment
;
287 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
, &segment
);
288 if (HSA_REGION_SEGMENT_GLOBAL
!= segment
)
289 return HSA_STATUS_SUCCESS
;
291 /* Find a region with the KERNARG flag set. */
292 hsa_region_global_flag_t flags
;
293 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
295 if (flags
& HSA_REGION_GLOBAL_FLAG_KERNARG
)
297 kernargs_region
= region
;
298 return HSA_STATUS_INFO_BREAK
;
301 /* The region was not suitable. */
302 return HSA_STATUS_SUCCESS
;
305 /* Initialize the HSA Runtime library and GPU device. */
310 /* Load the shared library and find the API functions. */
311 init_hsa_runtime_functions ();
313 /* Initialize the HSA Runtime. */
314 XHSA (hsa_fns
.hsa_init_fn (),
315 "Initialize run-time");
317 /* Select a suitable device.
318 The call-back function, get_gpu_agent, does the selection. */
319 XHSA_CMP (hsa_fns
.hsa_iterate_agents_fn (get_gpu_agent
, NULL
),
320 status
== HSA_STATUS_SUCCESS
|| status
== HSA_STATUS_INFO_BREAK
,
323 /* Initialize the queue used for launching kernels. */
324 uint32_t queue_size
= 0;
325 XHSA (hsa_fns
.hsa_agent_get_info_fn (device
, HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
327 "Find max queue size");
328 XHSA (hsa_fns
.hsa_queue_create_fn (device
, queue_size
,
329 HSA_QUEUE_TYPE_SINGLE
, NULL
,
330 NULL
, UINT32_MAX
, UINT32_MAX
, &queue
),
331 "Set up a device queue");
333 /* Select a memory region for the kernel arguments.
334 The call-back function, get_kernarg_region, does the selection. */
335 XHSA_CMP (hsa_fns
.hsa_agent_iterate_regions_fn (device
, get_kernarg_region
,
337 status
== HSA_STATUS_SUCCESS
|| status
== HSA_STATUS_INFO_BREAK
,
338 "Locate kernargs memory");
342 /* Read a whole input file.
343 Code copied from mkoffload. */
346 read_file (const char *filename
, size_t *plen
)
348 size_t alloc
= 16384;
352 FILE *stream
= fopen (filename
, "rb");
359 if (!fseek (stream
, 0, SEEK_END
))
361 /* Get the file size. */
362 long s
= ftell (stream
);
365 fseek (stream
, 0, SEEK_SET
);
367 buffer
= malloc (alloc
);
371 size_t n
= fread (buffer
+ base
, 1, alloc
- base
- 1, stream
);
376 if (base
+ 1 == alloc
)
379 buffer
= realloc (buffer
, alloc
);
390 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */
393 load_image (const char *filename
)
396 Elf64_Ehdr
*image
= (void *) read_file (filename
, &image_size
);
398 /* An "executable" consists of one or more code objects. */
399 XHSA (hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
400 HSA_EXECUTABLE_STATE_UNFROZEN
, "",
402 "Initialize GCN executable");
404 /* Hide relocations from the HSA runtime loader.
405 Keep a copy of the unmodified section headers to use later. */
406 Elf64_Shdr
*image_sections
=
407 (Elf64_Shdr
*) ((char *) image
+ image
->e_shoff
);
408 Elf64_Shdr
*sections
= malloc (sizeof (Elf64_Shdr
) * image
->e_shnum
);
409 memcpy (sections
, image_sections
, sizeof (Elf64_Shdr
) * image
->e_shnum
);
410 for (int i
= image
->e_shnum
- 1; i
>= 0; i
--)
412 if (image_sections
[i
].sh_type
== SHT_RELA
413 || image_sections
[i
].sh_type
== SHT_REL
)
414 /* Change section type to something harmless. */
415 image_sections
[i
].sh_type
= SHT_NOTE
;
418 /* Add the HSACO to the executable. */
419 hsa_code_object_t co
= { 0 };
420 XHSA (hsa_fns
.hsa_code_object_deserialize_fn (image
, image_size
, NULL
, &co
),
421 "Deserialize GCN code object");
422 XHSA (hsa_fns
.hsa_executable_load_code_object_fn (executable
, device
, co
,
424 "Load GCN code object");
426 /* We're done modifying he executable. */
427 XHSA (hsa_fns
.hsa_executable_freeze_fn (executable
, ""),
428 "Freeze GCN executable");
430 /* Locate the "main" function, and read the kernel's properties. */
431 hsa_executable_symbol_t symbol
;
432 XHSA (hsa_fns
.hsa_executable_get_symbol_fn (executable
, NULL
, "main",
434 "Find 'main' function");
435 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
436 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &kernel
),
437 "Extract kernel object");
438 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
439 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
440 &kernarg_segment_size
),
441 "Extract kernarg segment size");
442 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
443 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
444 &group_segment_size
),
445 "Extract group segment size");
446 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
447 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
448 &private_segment_size
),
449 "Extract private segment size");
451 /* Find main function in ELF, and calculate actual load offset. */
452 Elf64_Addr load_offset
;
453 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
454 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
456 "Extract 'main' symbol address");
457 for (int i
= 0; i
< image
->e_shnum
; i
++)
458 if (sections
[i
].sh_type
== SHT_SYMTAB
)
460 Elf64_Shdr
*strtab
= §ions
[sections
[i
].sh_link
];
461 char *strings
= (char *) image
+ strtab
->sh_offset
;
463 for (size_t offset
= 0;
464 offset
< sections
[i
].sh_size
;
465 offset
+= sections
[i
].sh_entsize
)
467 Elf64_Sym
*sym
= (Elf64_Sym
*) ((char *) image
468 + sections
[i
].sh_offset
+ offset
);
469 if (strcmp ("main", strings
+ sym
->st_name
) == 0)
471 load_offset
-= sym
->st_value
;
476 /* We only get here when main was not found.
477 This should never happen. */
478 fprintf (stderr
, "Error: main function not found.\n");
482 /* Find dynamic symbol table. */
483 Elf64_Shdr
*dynsym
= NULL
;
484 for (int i
= 0; i
< image
->e_shnum
; i
++)
485 if (sections
[i
].sh_type
== SHT_DYNSYM
)
487 dynsym
= §ions
[i
];
491 /* Fix up relocations. */
492 for (int i
= 0; i
< image
->e_shnum
; i
++)
494 if (sections
[i
].sh_type
== SHT_RELA
)
495 for (size_t offset
= 0;
496 offset
< sections
[i
].sh_size
;
497 offset
+= sections
[i
].sh_entsize
)
499 Elf64_Rela
*reloc
= (Elf64_Rela
*) ((char *) image
500 + sections
[i
].sh_offset
504 ? (Elf64_Sym
*) ((char *) image
506 + (dynsym
->sh_entsize
507 * ELF64_R_SYM (reloc
->r_info
))) : NULL
);
509 int64_t S
= (sym
? sym
->st_value
: 0);
510 int64_t P
= reloc
->r_offset
+ load_offset
;
511 int64_t A
= reloc
->r_addend
;
512 int64_t B
= load_offset
;
514 switch (ELF64_R_TYPE (reloc
->r_info
))
516 case R_AMDGPU_ABS32_LO
:
517 V
= (S
+ A
) & 0xFFFFFFFF;
520 case R_AMDGPU_ABS32_HI
:
534 LLD seems to emit REL64 where the the assembler has ABS64.
535 This is clearly wrong because it's not what the compiler
536 is expecting. Let's assume, for now, that it's a bug.
537 In any case, GCN kernels are always self contained and
538 therefore relative relocations will have been resolved
539 already, so this should be a safe workaround. */
540 V
= S
+ A
/* - P */ ;
547 /* TODO R_AMDGPU_GOTPCREL */
548 /* TODO R_AMDGPU_GOTPCREL32_LO */
549 /* TODO R_AMDGPU_GOTPCREL32_HI */
550 case R_AMDGPU_REL32_LO
:
551 V
= (S
+ A
- P
) & 0xFFFFFFFF;
554 case R_AMDGPU_REL32_HI
:
555 V
= (S
+ A
- P
) >> 32;
558 case R_AMDGPU_RELATIVE64
:
563 fprintf (stderr
, "Error: unsupported relocation type.\n");
566 XHSA (hsa_fns
.hsa_memory_copy_fn ((void *) P
, &V
, size
),
567 "Fix up relocation");
572 /* Allocate some device memory from the kernargs region.
573 The returned address will be 32-bit (with excess zeroed on 64-bit host),
574 and accessible via the same address on both host and target (via
575 __flat_scalar GCN address space). */
578 device_malloc (size_t size
)
581 XHSA (hsa_fns
.hsa_memory_allocate_fn (kernargs_region
, size
, &result
),
582 "Allocate device memory");
586 /* These are the device pointers that will be transferred to the target.
587 The HSA Runtime points the kernargs register here.
588 They correspond to function signature:
589 int main (int argc, char *argv[], int *return_value)
590 The compiler expects this, for kernel functions, and will
591 automatically assign the exit value to *return_value. */
626 /* Print any console output from the kernel.
627 We print all entries from print_index to the next entry without a "written"
628 flag. Subsequent calls should use the returned print_index value to resume
629 from the same point. */
631 gomp_print_output (struct kernargs
*kernargs
, int *print_index
)
633 int limit
= (sizeof (kernargs
->output_data
.queue
)
634 / sizeof (kernargs
->output_data
.queue
[0]));
637 for (i
= *print_index
; i
< limit
; i
++)
639 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
];
647 printf ("%.128s%ld\n", data
->msg
, data
->ivalue
);
650 printf ("%.128s%f\n", data
->msg
, data
->dvalue
);
653 printf ("%.128s%.128s\n", data
->msg
, data
->text
);
656 printf ("%.128s%.128s", data
->msg
, data
->text
);
663 if (*print_index
< limit
&& i
== limit
664 && kernargs
->output_data
.next_output
> limit
)
665 printf ("WARNING: GCN print buffer exhausted.\n");
670 /* Execute an already-loaded kernel on the device. */
675 /* A "signal" is used to launch and monitor the kernel. */
677 XHSA (hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &signal
),
680 /* Configure for a single-worker kernel. */
681 uint64_t index
= hsa_fns
.hsa_queue_load_write_index_relaxed_fn (queue
);
682 const uint32_t queueMask
= queue
->size
- 1;
683 hsa_kernel_dispatch_packet_t
*dispatch_packet
=
684 &(((hsa_kernel_dispatch_packet_t
*) (queue
->base_address
))[index
&
686 dispatch_packet
->setup
|= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
;
687 dispatch_packet
->workgroup_size_x
= (uint16_t) 1;
688 dispatch_packet
->workgroup_size_y
= (uint16_t) 64;
689 dispatch_packet
->workgroup_size_z
= (uint16_t) 1;
690 dispatch_packet
->grid_size_x
= 1;
691 dispatch_packet
->grid_size_y
= 64;
692 dispatch_packet
->grid_size_z
= 1;
693 dispatch_packet
->completion_signal
= signal
;
694 dispatch_packet
->kernel_object
= kernel
;
695 dispatch_packet
->kernarg_address
= (void *) kernargs
;
696 dispatch_packet
->private_segment_size
= private_segment_size
;
697 dispatch_packet
->group_segment_size
= group_segment_size
;
700 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
701 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
702 header
|= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
704 __atomic_store_n ((uint32_t *) dispatch_packet
,
705 header
| (dispatch_packet
->setup
<< 16),
709 fprintf (stderr
, "Launch kernel\n");
711 hsa_fns
.hsa_queue_store_write_index_relaxed_fn (queue
, index
+ 1);
712 hsa_fns
.hsa_signal_store_relaxed_fn (queue
->doorbell_signal
, index
);
713 /* Kernel running ...... */
715 while (hsa_fns
.hsa_signal_wait_relaxed_fn (signal
, HSA_SIGNAL_CONDITION_LT
,
717 HSA_WAIT_STATE_ACTIVE
) != 0)
720 gomp_print_output (kernargs
, &print_index
);
723 gomp_print_output (kernargs
, &print_index
);
726 fprintf (stderr
, "Kernel exited\n");
728 XHSA (hsa_fns
.hsa_signal_destroy_fn (signal
),
733 main (int argc
, char *argv
[])
736 for (int i
= 1; i
< argc
; i
++)
738 if (!strcmp (argv
[i
], "--help"))
743 else if (!strcmp (argv
[i
], "--version"))
748 else if (!strcmp (argv
[i
], "--debug"))
750 else if (argv
[i
][0] == '-')
764 /* No kernel arguments were found. */
769 /* The remaining arguments are for the GCN kernel. */
770 int kernel_argc
= argc
- kernel_arg
;
771 char **kernel_argv
= &argv
[kernel_arg
];
774 load_image (kernel_argv
[0]);
776 /* Calculate size of function parameters + argv data. */
777 size_t args_size
= 0;
778 for (int i
= 0; i
< kernel_argc
; i
++)
779 args_size
+= strlen (kernel_argv
[i
]) + 1;
781 /* Allocate device memory for both function parameters and the argv
783 size_t heap_size
= 10 * 1024 * 1024; /* 10MB. */
784 struct kernargs
*kernargs
= device_malloc (sizeof (*kernargs
) + heap_size
);
787 int64_t argv_data
[kernel_argc
];
788 char strings
[args_size
];
789 } *args
= device_malloc (sizeof (struct argdata
));
791 /* Write the data to the target. */
792 kernargs
->argc
= kernel_argc
;
793 kernargs
->argv
= (int64_t) args
->argv_data
;
794 kernargs
->out_ptr
= (int64_t) &kernargs
->output_data
;
795 kernargs
->output_data
.return_value
= 0xcafe0000; /* Default return value. */
796 kernargs
->output_data
.next_output
= 0;
797 for (unsigned i
= 0; i
< (sizeof (kernargs
->output_data
.queue
)
798 / sizeof (kernargs
->output_data
.queue
[0])); i
++)
799 kernargs
->output_data
.queue
[i
].written
= 0;
801 for (int i
= 0; i
< kernel_argc
; i
++)
803 size_t arg_len
= strlen (kernel_argv
[i
]) + 1;
804 args
->argv_data
[i
] = (int64_t) &args
->strings
[offset
];
805 memcpy (&args
->strings
[offset
], kernel_argv
[i
], arg_len
+ 1);
808 kernargs
->heap_ptr
= (int64_t) &kernargs
->heap
;
809 kernargs
->heap
.size
= heap_size
;
811 /* Run the kernel on the GPU. */
813 unsigned int return_value
=
814 (unsigned int) kernargs
->output_data
.return_value
;
816 unsigned int upper
= (return_value
& ~0xffff) >> 16;
818 printf ("Kernel exit value was never set\n");
819 else if (upper
== 0xffff)
822 ; /* Set by return from main. */
824 printf ("Possible kernel exit value corruption, 2 most significant bytes "
825 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value
);
829 unsigned int signal
= (return_value
>> 8) & 0xff;
830 if (signal
== SIGABRT
)
831 printf ("Kernel aborted\n");
832 else if (signal
!= 0)
833 printf ("Kernel received unkown signal\n");
837 printf ("Kernel exit value: %d\n", return_value
& 0xff);
839 /* Clean shut down. */
840 XHSA (hsa_fns
.hsa_memory_free_fn (kernargs
),
841 "Clean up device memory");
842 XHSA (hsa_fns
.hsa_executable_destroy_fn (executable
),
843 "Clean up GCN executable");
844 XHSA (hsa_fns
.hsa_queue_destroy_fn (queue
),
845 "Clean up device queue");
846 XHSA (hsa_fns
.hsa_shut_down_fn (),
847 "Shut down run-time");
849 return return_value
& 0xff;