GCN back-end code
[official-gcc.git] / gcc / config / gcn / gcn-run.c
blob58089843ef8a7dfa128bb60803fdde458d6e1a04
1 /* Run a stand-alone AMD GCN kernel.
3 Copyright 2017 Mentor Graphics Corporation
4 Copyright 2018-2019 Free Software Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
21 The kernel entry point's signature must use a standard main signature:
23 int main(int argc, char **argv)
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <stdlib.h>
29 #include <malloc.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <dlfcn.h>
33 #include <unistd.h>
34 #include <elf.h>
35 #include <signal.h>
37 /* These probably won't be in elf.h for a while. */
38 #ifndef R_AMDGPU_NONE
39 #define R_AMDGPU_NONE 0
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
42 #define R_AMDGPU_ABS64 3 /* S + A */
43 #define R_AMDGPU_REL32 4 /* S + A - P */
44 #define R_AMDGPU_REL64 5 /* S + A - P */
45 #define R_AMDGPU_ABS32 6 /* S + A */
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
51 #define reserved 12
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */
53 #endif
55 #include "hsa.h"
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
59 #endif
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
63 #endif
65 bool debug = false;
67 hsa_agent_t device = { 0 };
68 hsa_queue_t *queue = NULL;
69 uint64_t kernel = 0;
70 hsa_executable_t executable = { 0 };
72 hsa_region_t kernargs_region = { 0 };
73 uint32_t kernarg_segment_size = 0;
74 uint32_t group_segment_size = 0;
75 uint32_t private_segment_size = 0;
77 static void
78 usage (const char *progname)
80 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
81 "Options:\n"
82 " --help\n"
83 " --version\n"
84 " --debug\n", progname);
87 static void
88 version (const char *progname)
90 printf ("%s " VERSION_STRING "\n", progname);
93 /* As an HSA runtime is dlopened, following structure defines the necessary
94 function pointers.
95 Code adapted from libgomp. */
97 struct hsa_runtime_fn_info
99 /* HSA runtime. */
100 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
101 const char **status_string);
102 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
103 hsa_agent_info_t attribute,
104 void *value);
105 hsa_status_t (*hsa_init_fn) (void);
106 hsa_status_t (*hsa_iterate_agents_fn)
107 (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
108 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
109 hsa_region_info_t attribute,
110 void *value);
111 hsa_status_t (*hsa_queue_create_fn)
112 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
113 void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
114 void *data, uint32_t private_segment_size,
115 uint32_t group_segment_size, hsa_queue_t **queue);
116 hsa_status_t (*hsa_agent_iterate_regions_fn)
117 (hsa_agent_t agent,
118 hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
119 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
120 hsa_status_t (*hsa_executable_create_fn)
121 (hsa_profile_t profile, hsa_executable_state_t executable_state,
122 const char *options, hsa_executable_t *executable);
123 hsa_status_t (*hsa_executable_global_variable_define_fn)
124 (hsa_executable_t executable, const char *variable_name, void *address);
125 hsa_status_t (*hsa_executable_load_code_object_fn)
126 (hsa_executable_t executable, hsa_agent_t agent,
127 hsa_code_object_t code_object, const char *options);
128 hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
129 const char *options);
130 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
131 uint32_t num_consumers,
132 const hsa_agent_t *consumers,
133 hsa_signal_t *signal);
134 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
135 void **ptr);
136 hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
137 size_t size);
138 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
139 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
140 hsa_status_t (*hsa_executable_get_symbol_fn)
141 (hsa_executable_t executable, const char *module_name,
142 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
143 hsa_executable_symbol_t *symbol);
144 hsa_status_t (*hsa_executable_symbol_get_info_fn)
145 (hsa_executable_symbol_t executable_symbol,
146 hsa_executable_symbol_info_t attribute, void *value);
147 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
148 hsa_signal_value_t value);
149 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
150 (hsa_signal_t signal, hsa_signal_condition_t condition,
151 hsa_signal_value_t compare_value, uint64_t timeout_hint,
152 hsa_wait_state_t wait_state_hint);
153 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
154 (hsa_signal_t signal, hsa_signal_condition_t condition,
155 hsa_signal_value_t compare_value, uint64_t timeout_hint,
156 hsa_wait_state_t wait_state_hint);
157 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
158 hsa_status_t (*hsa_code_object_deserialize_fn)
159 (void *serialized_code_object, size_t serialized_code_object_size,
160 const char *options, hsa_code_object_t *code_object);
161 uint64_t (*hsa_queue_load_write_index_relaxed_fn)
162 (const hsa_queue_t *queue);
163 void (*hsa_queue_store_write_index_relaxed_fn)
164 (const hsa_queue_t *queue, uint64_t value);
165 hsa_status_t (*hsa_shut_down_fn) ();
168 /* HSA runtime functions that are initialized in init_hsa_context.
169 Code adapted from libgomp. */
171 static struct hsa_runtime_fn_info hsa_fns;
173 #define DLSYM_FN(function) \
174 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
175 if (hsa_fns.function##_fn == NULL) \
176 goto fail;
178 static void
179 init_hsa_runtime_functions (void)
181 void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
182 if (handle == NULL)
184 fprintf (stderr,
185 "The HSA runtime is required to run GCN kernels on hardware.\n"
186 "%s: File not found or could not be opened\n",
187 HSA_RUNTIME_LIB);
188 exit (1);
191 DLSYM_FN (hsa_status_string)
192 DLSYM_FN (hsa_agent_get_info)
193 DLSYM_FN (hsa_init)
194 DLSYM_FN (hsa_iterate_agents)
195 DLSYM_FN (hsa_region_get_info)
196 DLSYM_FN (hsa_queue_create)
197 DLSYM_FN (hsa_agent_iterate_regions)
198 DLSYM_FN (hsa_executable_destroy)
199 DLSYM_FN (hsa_executable_create)
200 DLSYM_FN (hsa_executable_global_variable_define)
201 DLSYM_FN (hsa_executable_load_code_object)
202 DLSYM_FN (hsa_executable_freeze)
203 DLSYM_FN (hsa_signal_create)
204 DLSYM_FN (hsa_memory_allocate)
205 DLSYM_FN (hsa_memory_copy)
206 DLSYM_FN (hsa_memory_free)
207 DLSYM_FN (hsa_signal_destroy)
208 DLSYM_FN (hsa_executable_get_symbol)
209 DLSYM_FN (hsa_executable_symbol_get_info)
210 DLSYM_FN (hsa_signal_wait_acquire)
211 DLSYM_FN (hsa_signal_wait_relaxed)
212 DLSYM_FN (hsa_signal_store_relaxed)
213 DLSYM_FN (hsa_queue_destroy)
214 DLSYM_FN (hsa_code_object_deserialize)
215 DLSYM_FN (hsa_queue_load_write_index_relaxed)
216 DLSYM_FN (hsa_queue_store_write_index_relaxed)
217 DLSYM_FN (hsa_shut_down)
219 return;
221 fail:
222 fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
223 exit (1);
226 #undef DLSYM_FN
228 /* Report a fatal error STR together with the HSA error corresponding to
229 STATUS and terminate execution of the current process. */
231 static void
232 hsa_fatal (const char *str, hsa_status_t status)
234 const char *hsa_error_msg;
235 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
236 fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
237 hsa_error_msg);
238 exit (1);
241 /* Helper macros to ensure we check the return values from the HSA Runtime.
242 These just keep the rest of the code a bit cleaner. */
244 #define XHSA_CMP(FN, CMP, MSG) \
245 do { \
246 hsa_status_t status = (FN); \
247 if (!(CMP)) \
248 hsa_fatal ((MSG), status); \
249 else if (debug) \
250 fprintf (stderr, "%s: OK\n", (MSG)); \
251 } while (0)
252 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
254 /* Callback of hsa_iterate_agents.
255 Called once for each available device, and returns "break" when a
256 suitable one has been found. */
258 static hsa_status_t
259 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
261 hsa_device_type_t device_type;
262 XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
263 &device_type),
264 "Get agent type");
266 /* Select only GPU devices. */
267 /* TODO: support selecting from multiple GPUs. */
268 if (HSA_DEVICE_TYPE_GPU == device_type)
270 device = agent;
271 return HSA_STATUS_INFO_BREAK;
274 /* The device was not suitable. */
275 return HSA_STATUS_SUCCESS;
278 /* Callback of hsa_iterate_regions.
279 Called once for each available memory region, and returns "break" when a
280 suitable one has been found. */
282 static hsa_status_t
283 get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused)))
285 /* Reject non-global regions. */
286 hsa_region_segment_t segment;
287 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
288 if (HSA_REGION_SEGMENT_GLOBAL != segment)
289 return HSA_STATUS_SUCCESS;
291 /* Find a region with the KERNARG flag set. */
292 hsa_region_global_flag_t flags;
293 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
294 &flags);
295 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
297 kernargs_region = region;
298 return HSA_STATUS_INFO_BREAK;
301 /* The region was not suitable. */
302 return HSA_STATUS_SUCCESS;
305 /* Initialize the HSA Runtime library and GPU device. */
307 static void
308 init_device ()
310 /* Load the shared library and find the API functions. */
311 init_hsa_runtime_functions ();
313 /* Initialize the HSA Runtime. */
314 XHSA (hsa_fns.hsa_init_fn (),
315 "Initialize run-time");
317 /* Select a suitable device.
318 The call-back function, get_gpu_agent, does the selection. */
319 XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
320 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
321 "Find a device");
323 /* Initialize the queue used for launching kernels. */
324 uint32_t queue_size = 0;
325 XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
326 &queue_size),
327 "Find max queue size");
328 XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
329 HSA_QUEUE_TYPE_SINGLE, NULL,
330 NULL, UINT32_MAX, UINT32_MAX, &queue),
331 "Set up a device queue");
333 /* Select a memory region for the kernel arguments.
334 The call-back function, get_kernarg_region, does the selection. */
335 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
336 NULL),
337 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
338 "Locate kernargs memory");
342 /* Read a whole input file.
343 Code copied from mkoffload. */
345 static char *
346 read_file (const char *filename, size_t *plen)
348 size_t alloc = 16384;
349 size_t base = 0;
350 char *buffer;
352 FILE *stream = fopen (filename, "rb");
353 if (!stream)
355 perror (filename);
356 exit (1);
359 if (!fseek (stream, 0, SEEK_END))
361 /* Get the file size. */
362 long s = ftell (stream);
363 if (s >= 0)
364 alloc = s + 100;
365 fseek (stream, 0, SEEK_SET);
367 buffer = malloc (alloc);
369 for (;;)
371 size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
373 if (!n)
374 break;
375 base += n;
376 if (base + 1 == alloc)
378 alloc *= 2;
379 buffer = realloc (buffer, alloc);
382 buffer[base] = 0;
383 *plen = base;
385 fclose (stream);
387 return buffer;
390 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */
392 static void
393 load_image (const char *filename)
395 size_t image_size;
396 Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
398 /* An "executable" consists of one or more code objects. */
399 XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
400 HSA_EXECUTABLE_STATE_UNFROZEN, "",
401 &executable),
402 "Initialize GCN executable");
404 /* Hide relocations from the HSA runtime loader.
405 Keep a copy of the unmodified section headers to use later. */
406 Elf64_Shdr *image_sections =
407 (Elf64_Shdr *) ((char *) image + image->e_shoff);
408 Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
409 memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
410 for (int i = image->e_shnum - 1; i >= 0; i--)
412 if (image_sections[i].sh_type == SHT_RELA
413 || image_sections[i].sh_type == SHT_REL)
414 /* Change section type to something harmless. */
415 image_sections[i].sh_type = SHT_NOTE;
418 /* Add the HSACO to the executable. */
419 hsa_code_object_t co = { 0 };
420 XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
421 "Deserialize GCN code object");
422 XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
423 ""),
424 "Load GCN code object");
426 /* We're done modifying he executable. */
427 XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
428 "Freeze GCN executable");
430 /* Locate the "main" function, and read the kernel's properties. */
431 hsa_executable_symbol_t symbol;
432 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
433 device, 0, &symbol),
434 "Find 'main' function");
435 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
436 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel),
437 "Extract kernel object");
438 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
439 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
440 &kernarg_segment_size),
441 "Extract kernarg segment size");
442 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
443 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
444 &group_segment_size),
445 "Extract group segment size");
446 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
447 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
448 &private_segment_size),
449 "Extract private segment size");
451 /* Find main function in ELF, and calculate actual load offset. */
452 Elf64_Addr load_offset;
453 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
454 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
455 &load_offset),
456 "Extract 'main' symbol address");
457 for (int i = 0; i < image->e_shnum; i++)
458 if (sections[i].sh_type == SHT_SYMTAB)
460 Elf64_Shdr *strtab = &sections[sections[i].sh_link];
461 char *strings = (char *) image + strtab->sh_offset;
463 for (size_t offset = 0;
464 offset < sections[i].sh_size;
465 offset += sections[i].sh_entsize)
467 Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
468 + sections[i].sh_offset + offset);
469 if (strcmp ("main", strings + sym->st_name) == 0)
471 load_offset -= sym->st_value;
472 goto found_main;
476 /* We only get here when main was not found.
477 This should never happen. */
478 fprintf (stderr, "Error: main function not found.\n");
479 abort ();
480 found_main:;
482 /* Find dynamic symbol table. */
483 Elf64_Shdr *dynsym = NULL;
484 for (int i = 0; i < image->e_shnum; i++)
485 if (sections[i].sh_type == SHT_DYNSYM)
487 dynsym = &sections[i];
488 break;
491 /* Fix up relocations. */
492 for (int i = 0; i < image->e_shnum; i++)
494 if (sections[i].sh_type == SHT_RELA)
495 for (size_t offset = 0;
496 offset < sections[i].sh_size;
497 offset += sections[i].sh_entsize)
499 Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
500 + sections[i].sh_offset
501 + offset);
502 Elf64_Sym *sym =
503 (dynsym
504 ? (Elf64_Sym *) ((char *) image
505 + dynsym->sh_offset
506 + (dynsym->sh_entsize
507 * ELF64_R_SYM (reloc->r_info))) : NULL);
509 int64_t S = (sym ? sym->st_value : 0);
510 int64_t P = reloc->r_offset + load_offset;
511 int64_t A = reloc->r_addend;
512 int64_t B = load_offset;
513 int64_t V, size;
514 switch (ELF64_R_TYPE (reloc->r_info))
516 case R_AMDGPU_ABS32_LO:
517 V = (S + A) & 0xFFFFFFFF;
518 size = 4;
519 break;
520 case R_AMDGPU_ABS32_HI:
521 V = (S + A) >> 32;
522 size = 4;
523 break;
524 case R_AMDGPU_ABS64:
525 V = S + A;
526 size = 8;
527 break;
528 case R_AMDGPU_REL32:
529 V = S + A - P;
530 size = 4;
531 break;
532 case R_AMDGPU_REL64:
533 /* FIXME
534 LLD seems to emit REL64 where the the assembler has ABS64.
535 This is clearly wrong because it's not what the compiler
536 is expecting. Let's assume, for now, that it's a bug.
537 In any case, GCN kernels are always self contained and
538 therefore relative relocations will have been resolved
539 already, so this should be a safe workaround. */
540 V = S + A /* - P */ ;
541 size = 8;
542 break;
543 case R_AMDGPU_ABS32:
544 V = S + A;
545 size = 4;
546 break;
547 /* TODO R_AMDGPU_GOTPCREL */
548 /* TODO R_AMDGPU_GOTPCREL32_LO */
549 /* TODO R_AMDGPU_GOTPCREL32_HI */
550 case R_AMDGPU_REL32_LO:
551 V = (S + A - P) & 0xFFFFFFFF;
552 size = 4;
553 break;
554 case R_AMDGPU_REL32_HI:
555 V = (S + A - P) >> 32;
556 size = 4;
557 break;
558 case R_AMDGPU_RELATIVE64:
559 V = B + A;
560 size = 8;
561 break;
562 default:
563 fprintf (stderr, "Error: unsupported relocation type.\n");
564 exit (1);
566 XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
567 "Fix up relocation");
572 /* Allocate some device memory from the kernargs region.
573 The returned address will be 32-bit (with excess zeroed on 64-bit host),
574 and accessible via the same address on both host and target (via
575 __flat_scalar GCN address space). */
577 static void *
578 device_malloc (size_t size)
580 void *result;
581 XHSA (hsa_fns.hsa_memory_allocate_fn (kernargs_region, size, &result),
582 "Allocate device memory");
583 return result;
586 /* These are the device pointers that will be transferred to the target.
587 The HSA Runtime points the kernargs register here.
588 They correspond to function signature:
589 int main (int argc, char *argv[], int *return_value)
590 The compiler expects this, for kernel functions, and will
591 automatically assign the exit value to *return_value. */
592 struct kernargs
594 /* Kernargs. */
595 int32_t argc;
596 int64_t argv;
597 int64_t out_ptr;
598 int64_t heap_ptr;
600 /* Output data. */
601 struct output
603 int return_value;
604 int next_output;
605 struct printf_data
607 int written;
608 char msg[128];
609 int type;
610 union
612 int64_t ivalue;
613 double dvalue;
614 char text[128];
616 } queue[1000];
617 } output_data;
619 struct heap
621 int64_t size;
622 char data[0];
623 } heap;
626 /* Print any console output from the kernel.
627 We print all entries from print_index to the next entry without a "written"
628 flag. Subsequent calls should use the returned print_index value to resume
629 from the same point. */
630 void
631 gomp_print_output (struct kernargs *kernargs, int *print_index)
633 int limit = (sizeof (kernargs->output_data.queue)
634 / sizeof (kernargs->output_data.queue[0]));
636 int i;
637 for (i = *print_index; i < limit; i++)
639 struct printf_data *data = &kernargs->output_data.queue[i];
641 if (!data->written)
642 break;
644 switch (data->type)
646 case 0:
647 printf ("%.128s%ld\n", data->msg, data->ivalue);
648 break;
649 case 1:
650 printf ("%.128s%f\n", data->msg, data->dvalue);
651 break;
652 case 2:
653 printf ("%.128s%.128s\n", data->msg, data->text);
654 break;
655 case 3:
656 printf ("%.128s%.128s", data->msg, data->text);
657 break;
660 data->written = 0;
663 if (*print_index < limit && i == limit
664 && kernargs->output_data.next_output > limit)
665 printf ("WARNING: GCN print buffer exhausted.\n");
667 *print_index = i;
670 /* Execute an already-loaded kernel on the device. */
672 static void
673 run (void *kernargs)
675 /* A "signal" is used to launch and monitor the kernel. */
676 hsa_signal_t signal;
677 XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
678 "Create signal");
680 /* Configure for a single-worker kernel. */
681 uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
682 const uint32_t queueMask = queue->size - 1;
683 hsa_kernel_dispatch_packet_t *dispatch_packet =
684 &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
685 queueMask]);
686 dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
687 dispatch_packet->workgroup_size_x = (uint16_t) 1;
688 dispatch_packet->workgroup_size_y = (uint16_t) 64;
689 dispatch_packet->workgroup_size_z = (uint16_t) 1;
690 dispatch_packet->grid_size_x = 1;
691 dispatch_packet->grid_size_y = 64;
692 dispatch_packet->grid_size_z = 1;
693 dispatch_packet->completion_signal = signal;
694 dispatch_packet->kernel_object = kernel;
695 dispatch_packet->kernarg_address = (void *) kernargs;
696 dispatch_packet->private_segment_size = private_segment_size;
697 dispatch_packet->group_segment_size = group_segment_size;
699 uint16_t header = 0;
700 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
701 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
702 header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
704 __atomic_store_n ((uint32_t *) dispatch_packet,
705 header | (dispatch_packet->setup << 16),
706 __ATOMIC_RELEASE);
708 if (debug)
709 fprintf (stderr, "Launch kernel\n");
711 hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
712 hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
713 /* Kernel running ...... */
714 int print_index = 0;
715 while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
716 1, 1000000,
717 HSA_WAIT_STATE_ACTIVE) != 0)
719 usleep (10000);
720 gomp_print_output (kernargs, &print_index);
723 gomp_print_output (kernargs, &print_index);
725 if (debug)
726 fprintf (stderr, "Kernel exited\n");
728 XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
729 "Clean up signal");
733 main (int argc, char *argv[])
735 int kernel_arg = 0;
736 for (int i = 1; i < argc; i++)
738 if (!strcmp (argv[i], "--help"))
740 usage (argv[0]);
741 return 0;
743 else if (!strcmp (argv[i], "--version"))
745 version (argv[0]);
746 return 0;
748 else if (!strcmp (argv[i], "--debug"))
749 debug = true;
750 else if (argv[i][0] == '-')
752 usage (argv[0]);
753 return 1;
755 else
757 kernel_arg = i;
758 break;
762 if (!kernel_arg)
764 /* No kernel arguments were found. */
765 usage (argv[0]);
766 return 1;
769 /* The remaining arguments are for the GCN kernel. */
770 int kernel_argc = argc - kernel_arg;
771 char **kernel_argv = &argv[kernel_arg];
773 init_device ();
774 load_image (kernel_argv[0]);
776 /* Calculate size of function parameters + argv data. */
777 size_t args_size = 0;
778 for (int i = 0; i < kernel_argc; i++)
779 args_size += strlen (kernel_argv[i]) + 1;
781 /* Allocate device memory for both function parameters and the argv
782 data. */
783 size_t heap_size = 10 * 1024 * 1024; /* 10MB. */
784 struct kernargs *kernargs = device_malloc (sizeof (*kernargs) + heap_size);
785 struct argdata
787 int64_t argv_data[kernel_argc];
788 char strings[args_size];
789 } *args = device_malloc (sizeof (struct argdata));
791 /* Write the data to the target. */
792 kernargs->argc = kernel_argc;
793 kernargs->argv = (int64_t) args->argv_data;
794 kernargs->out_ptr = (int64_t) &kernargs->output_data;
795 kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
796 kernargs->output_data.next_output = 0;
797 for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
798 / sizeof (kernargs->output_data.queue[0])); i++)
799 kernargs->output_data.queue[i].written = 0;
800 int offset = 0;
801 for (int i = 0; i < kernel_argc; i++)
803 size_t arg_len = strlen (kernel_argv[i]) + 1;
804 args->argv_data[i] = (int64_t) &args->strings[offset];
805 memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
806 offset += arg_len;
808 kernargs->heap_ptr = (int64_t) &kernargs->heap;
809 kernargs->heap.size = heap_size;
811 /* Run the kernel on the GPU. */
812 run (kernargs);
813 unsigned int return_value =
814 (unsigned int) kernargs->output_data.return_value;
816 unsigned int upper = (return_value & ~0xffff) >> 16;
817 if (upper == 0xcafe)
818 printf ("Kernel exit value was never set\n");
819 else if (upper == 0xffff)
820 ; /* Set by exit. */
821 else if (upper == 0)
822 ; /* Set by return from main. */
823 else
824 printf ("Possible kernel exit value corruption, 2 most significant bytes "
825 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
827 if (upper == 0xffff)
829 unsigned int signal = (return_value >> 8) & 0xff;
830 if (signal == SIGABRT)
831 printf ("Kernel aborted\n");
832 else if (signal != 0)
833 printf ("Kernel received unkown signal\n");
836 if (debug)
837 printf ("Kernel exit value: %d\n", return_value & 0xff);
839 /* Clean shut down. */
840 XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
841 "Clean up device memory");
842 XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
843 "Clean up GCN executable");
844 XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
845 "Clean up device queue");
846 XHSA (hsa_fns.hsa_shut_down_fn (),
847 "Shut down run-time");
849 return return_value & 0xff;