libgomp/plugin/plugin-gcn.c

   1 /* Plugin for AMD GCN execution.
   2
   3    Copyright (C) 2013-2023 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* {{{ Includes and defines  */
  30
  31 #include "config.h"
  32 #include "symcat.h"
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <pthread.h>
  37 #include <inttypes.h>
  38 #include <stdbool.h>
  39 #include <limits.h>
  40 #include <hsa.h>
  41 #include <hsa_ext_amd.h>
  42 #include <dlfcn.h>
  43 #include <signal.h>
  44 #include "libgomp-plugin.h"
  45 #include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
  46 #include "gomp-constants.h"
  47 #include <elf.h>
  48 #include "oacc-plugin.h"
  49 #include "oacc-int.h"
  50 #include <assert.h>
  51
  52 /* These probably won't be in elf.h for a while.  */
  53 #ifndef R_AMDGPU_NONE
  54 #define R_AMDGPU_NONE           0
  55 #define R_AMDGPU_ABS32_LO       1       /* (S + A) & 0xFFFFFFFF  */
  56 #define R_AMDGPU_ABS32_HI       2       /* (S + A) >> 32  */
  57 #define R_AMDGPU_ABS64          3       /* S + A  */
  58 #define R_AMDGPU_REL32          4       /* S + A - P  */
  59 #define R_AMDGPU_REL64          5       /* S + A - P  */
  60 #define R_AMDGPU_ABS32          6       /* S + A  */
  61 #define R_AMDGPU_GOTPCREL       7       /* G + GOT + A - P  */
  62 #define R_AMDGPU_GOTPCREL32_LO  8       /* (G + GOT + A - P) & 0xFFFFFFFF  */
  63 #define R_AMDGPU_GOTPCREL32_HI  9       /* (G + GOT + A - P) >> 32  */
  64 #define R_AMDGPU_REL32_LO       10      /* (S + A - P) & 0xFFFFFFFF  */
  65 #define R_AMDGPU_REL32_HI       11      /* (S + A - P) >> 32  */
  66 #define R_AMDGPU_RELATIVE64     13      /* B + A  */
  67 #endif
  68
  69 /* GCN specific definitions for asynchronous queues.  */
  70
  71 #define ASYNC_QUEUE_SIZE 64
  72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
  73 #define DEBUG_QUEUES 0
  74 #define DEBUG_THREAD_SLEEP 0
  75 #define DEBUG_THREAD_SIGNAL 0
  76
  77 /* Defaults.  */
  78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
  79
  80 /* Secure getenv() which returns NULL if running as SUID/SGID.  */
  81 #ifndef HAVE_SECURE_GETENV
  82 #ifdef HAVE___SECURE_GETENV
  83 #define secure_getenv __secure_getenv
  84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  85   && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
  86
  87 #include <unistd.h>
  88
  89 /* Implementation of secure_getenv() for targets where it is not provided but
  90    we have at least means to test real and effective IDs. */
  91
  92 static char *
  93 secure_getenv (const char *name)
  94 {
  95   if ((getuid () == geteuid ()) && (getgid () == getegid ()))
  96     return getenv (name);
  97   else
  98     return NULL;
  99 }
 100
 101 #else
 102 #define secure_getenv getenv
 103 #endif
 104 #endif
 105
 106 /* }}}  */
 107 /* {{{ Types  */
 108
 109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
 110
 111 struct gcn_thread
 112 {
 113   /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
 114   int async;
 115 };
 116
 117 /* As an HSA runtime is dlopened, following structure defines function
 118    pointers utilized by the HSA plug-in.  */
 119
 120 struct hsa_runtime_fn_info
 121 {
 122   /* HSA runtime.  */
 123   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
 124                                         const char **status_string);
 125   hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
 126                                           void *value);
 127   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
 128                                          hsa_agent_info_t attribute,
 129                                          void *value);
 130   hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
 131                                       hsa_isa_info_t attribute,
 132                                       uint32_t index,
 133                                       void *value);
 134   hsa_status_t (*hsa_init_fn) (void);
 135   hsa_status_t (*hsa_iterate_agents_fn)
 136     (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
 137   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
 138                                           hsa_region_info_t attribute,
 139                                           void *value);
 140   hsa_status_t (*hsa_queue_create_fn)
 141     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
 142      void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
 143      void *data, uint32_t private_segment_size,
 144      uint32_t group_segment_size, hsa_queue_t **queue);
 145   hsa_status_t (*hsa_agent_iterate_regions_fn)
 146     (hsa_agent_t agent,
 147      hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
 148   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
 149   hsa_status_t (*hsa_executable_create_fn)
 150     (hsa_profile_t profile, hsa_executable_state_t executable_state,
 151      const char *options, hsa_executable_t *executable);
 152   hsa_status_t (*hsa_executable_global_variable_define_fn)
 153     (hsa_executable_t executable, const char *variable_name, void *address);
 154   hsa_status_t (*hsa_executable_load_code_object_fn)
 155     (hsa_executable_t executable, hsa_agent_t agent,
 156      hsa_code_object_t code_object, const char *options);
 157   hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
 158                                            const char *options);
 159   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
 160                                         uint32_t num_consumers,
 161                                         const hsa_agent_t *consumers,
 162                                         hsa_signal_t *signal);
 163   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
 164                                           void **ptr);
 165   hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
 166                                               hsa_access_permission_t access);
 167   hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
 168   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
 169   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
 170   hsa_status_t (*hsa_executable_get_symbol_fn)
 171     (hsa_executable_t executable, const char *module_name,
 172      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
 173      hsa_executable_symbol_t *symbol);
 174   hsa_status_t (*hsa_executable_symbol_get_info_fn)
 175     (hsa_executable_symbol_t executable_symbol,
 176      hsa_executable_symbol_info_t attribute, void *value);
 177   hsa_status_t (*hsa_executable_iterate_symbols_fn)
 178     (hsa_executable_t executable,
 179      hsa_status_t (*callback)(hsa_executable_t executable,
 180                               hsa_executable_symbol_t symbol, void *data),
 181      void *data);
 182   uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
 183                                                     uint64_t value);
 184   uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
 185   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
 186                                        hsa_signal_value_t value);
 187   void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
 188                                        hsa_signal_value_t value);
 189   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
 190     (hsa_signal_t signal, hsa_signal_condition_t condition,
 191      hsa_signal_value_t compare_value, uint64_t timeout_hint,
 192      hsa_wait_state_t wait_state_hint);
 193   hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
 194   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
 195
 196   hsa_status_t (*hsa_code_object_deserialize_fn)
 197     (void *serialized_code_object, size_t serialized_code_object_size,
 198      const char *options, hsa_code_object_t *code_object);
 199 };
 200
 201 /* Structure describing the run-time and grid properties of an HSA kernel
 202    lauch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
 203
 204 struct GOMP_kernel_launch_attributes
 205 {
 206   /* Number of dimensions the workload has.  Maximum number is 3.  */
 207   uint32_t ndim;
 208   /* Size of the grid in the three respective dimensions.  */
 209   uint32_t gdims[3];
 210   /* Size of work-groups in the respective dimensions.  */
 211   uint32_t wdims[3];
 212 };
 213
 214 /* Collection of information needed for a dispatch of a kernel from a
 215    kernel.  */
 216
 217 struct kernel_dispatch
 218 {
 219   struct agent_info *agent;
 220   /* Pointer to a command queue associated with a kernel dispatch agent.  */
 221   void *queue;
 222   /* Pointer to a memory space used for kernel arguments passing.  */
 223   void *kernarg_address;
 224   /* Kernel object.  */
 225   uint64_t object;
 226   /* Synchronization signal used for dispatch synchronization.  */
 227   uint64_t signal;
 228   /* Private segment size.  */
 229   uint32_t private_segment_size;
 230   /* Group segment size.  */
 231   uint32_t group_segment_size;
 232 };
 233
 234 /* Structure of the kernargs segment, supporting console output.
 235
 236    This needs to match the definitions in Newlib, and the expectations
 237    in libgomp target code.  */
 238
 239 struct kernargs {
 240   struct kernargs_abi abi;
 241
 242   /* Output data.  */
 243   struct output output_data;
 244 };
 245
 246 /* A queue entry for a future asynchronous launch.  */
 247
 248 struct kernel_launch
 249 {
 250   struct kernel_info *kernel;
 251   void *vars;
 252   struct GOMP_kernel_launch_attributes kla;
 253 };
 254
 255 /* A queue entry for a future callback.  */
 256
 257 struct callback
 258 {
 259   void (*fn)(void *);
 260   void *data;
 261 };
 262
 263 /* A data struct for the copy_data callback.  */
 264
 265 struct copy_data
 266 {
 267   void *dst;
 268   const void *src;
 269   size_t len;
 270   struct goacc_asyncqueue *aq;
 271 };
 272
 273 /* A queue entry for a placeholder.  These correspond to a wait event.  */
 274
 275 struct placeholder
 276 {
 277   int executed;
 278   pthread_cond_t cond;
 279   pthread_mutex_t mutex;
 280 };
 281
 282 /* A queue entry for a wait directive.  */
 283
 284 struct asyncwait_info
 285 {
 286   struct placeholder *placeholderp;
 287 };
 288
 289 /* Encode the type of an entry in an async queue.  */
 290
 291 enum entry_type
 292 {
 293   KERNEL_LAUNCH,
 294   CALLBACK,
 295   ASYNC_WAIT,
 296   ASYNC_PLACEHOLDER
 297 };
 298
 299 /* An entry in an async queue.  */
 300
 301 struct queue_entry
 302 {
 303   enum entry_type type;
 304   union {
 305     struct kernel_launch launch;
 306     struct callback callback;
 307     struct asyncwait_info asyncwait;
 308     struct placeholder placeholder;
 309   } u;
 310 };
 311
 312 /* An async queue header.
 313
 314    OpenMP may create one of these.
 315    OpenACC may create many.  */
 316
 317 struct goacc_asyncqueue
 318 {
 319   struct agent_info *agent;
 320   hsa_queue_t *hsa_queue;
 321
 322   pthread_t thread_drain_queue;
 323   pthread_mutex_t mutex;
 324   pthread_cond_t queue_cond_in;
 325   pthread_cond_t queue_cond_out;
 326   struct queue_entry queue[ASYNC_QUEUE_SIZE];
 327   int queue_first;
 328   int queue_n;
 329   int drain_queue_stop;
 330
 331   int id;
 332   struct goacc_asyncqueue *prev;
 333   struct goacc_asyncqueue *next;
 334 };
 335
 336 /* Mkoffload uses this structure to describe a kernel.
 337
 338    OpenMP kernel dimensions are passed at runtime.
 339    OpenACC kernel dimensions are passed at compile time, here.  */
 340
 341 struct hsa_kernel_description
 342 {
 343   const char *name;
 344   int oacc_dims[3];  /* Only present for GCN kernels.  */
 345   int sgpr_count;
 346   int vpgr_count;
 347 };
 348
 349 /* Mkoffload uses this structure to describe an offload variable.  */
 350
 351 struct global_var_info
 352 {
 353   const char *name;
 354   void *address;
 355 };
 356
 357 /* Mkoffload uses this structure to describe all the kernels in a
 358    loadable module.  These are passed the libgomp via static constructors.  */
 359
 360 struct gcn_image_desc
 361 {
 362   struct gcn_image {
 363     size_t size;
 364     void *image;
 365   } *gcn_image;
 366   const unsigned kernel_count;
 367   struct hsa_kernel_description *kernel_infos;
 368   const unsigned ind_func_count;
 369   const unsigned global_variable_count;
 370 };
 371
 372 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
 373    support.
 374    See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
 375
 376 typedef enum {
 377   EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
 378   EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
 379   EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
 380   EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
 381   EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
 382   EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036
 383 } EF_AMDGPU_MACH;
 384
 385 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
 386 typedef EF_AMDGPU_MACH gcn_isa;
 387
 388 /* Description of an HSA GPU agent (device) and the program associated with
 389    it.  */
 390
 391 struct agent_info
 392 {
 393   /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
 394   hsa_agent_t id;
 395   /* The user-visible device number.  */
 396   int device_id;
 397   /* Whether the agent has been initialized.  The fields below are usable only
 398      if it has been.  */
 399   bool initialized;
 400
 401   /* The instruction set architecture of the device. */
 402   gcn_isa device_isa;
 403   /* Name of the agent. */
 404   char name[64];
 405   /* Name of the vendor of the agent. */
 406   char vendor_name[64];
 407   /* Command queues of the agent.  */
 408   hsa_queue_t *sync_queue;
 409   struct goacc_asyncqueue *async_queues, *omp_async_queue;
 410   pthread_mutex_t async_queues_mutex;
 411
 412   /* The HSA memory region from which to allocate kernel arguments.  */
 413   hsa_region_t kernarg_region;
 414
 415   /* The HSA memory region from which to allocate device data.  */
 416   hsa_region_t data_region;
 417
 418   /* Allocated ephemeral memories (team arena and stack space).  */
 419   struct ephemeral_memories_list *ephemeral_memories_list;
 420   pthread_mutex_t ephemeral_memories_write_lock;
 421
 422   /* Read-write lock that protects kernels which are running or about to be run
 423      from interference with loading and unloading of images.  Needs to be
 424      locked for reading while a kernel is being run, and for writing if the
 425      list of modules is manipulated (and thus the HSA program invalidated).  */
 426   pthread_rwlock_t module_rwlock;
 427
 428   /* The module associated with this kernel.  */
 429   struct module_info *module;
 430
 431   /* Mutex enforcing that only one thread will finalize the HSA program.  A
 432      thread should have locked agent->module_rwlock for reading before
 433      acquiring it.  */
 434   pthread_mutex_t prog_mutex;
 435   /* Flag whether the HSA program that consists of all the modules has been
 436      finalized.  */
 437   bool prog_finalized;
 438   /* HSA executable - the finalized program that is used to locate kernels.  */
 439   hsa_executable_t executable;
 440 };
 441
 442 /* Information required to identify, finalize and run any given kernel.  */
 443
 444 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
 445
 446 struct kernel_info
 447 {
 448   /* Name of the kernel, required to locate it within the GCN object-code
 449      module.  */
 450   const char *name;
 451   /* The specific agent the kernel has been or will be finalized for and run
 452      on.  */
 453   struct agent_info *agent;
 454   /* The specific module where the kernel takes place.  */
 455   struct module_info *module;
 456   /* Information provided by mkoffload associated with the kernel.  */
 457   struct hsa_kernel_description *description;
 458   /* Mutex enforcing that at most once thread ever initializes a kernel for
 459      use.  A thread should have locked agent->module_rwlock for reading before
 460      acquiring it.  */
 461   pthread_mutex_t init_mutex;
 462   /* Flag indicating whether the kernel has been initialized and all fields
 463      below it contain valid data.  */
 464   bool initialized;
 465   /* Flag indicating that the kernel has a problem that blocks an execution.  */
 466   bool initialization_failed;
 467   /* The object to be put into the dispatch queue.  */
 468   uint64_t object;
 469   /* Required size of kernel arguments.  */
 470   uint32_t kernarg_segment_size;
 471   /* Required size of group segment.  */
 472   uint32_t group_segment_size;
 473   /* Required size of private segment.  */
 474   uint32_t private_segment_size;
 475   /* Set up for OpenMP or OpenACC?  */
 476   enum offload_kind kind;
 477 };
 478
 479 /* Information about a particular GCN module, its image and kernels.  */
 480
 481 struct module_info
 482 {
 483   /* The description with which the program has registered the image.  */
 484   struct gcn_image_desc *image_desc;
 485   /* GCN heap allocation.  */
 486   struct heap *heap;
 487   /* Physical boundaries of the loaded module.  */
 488   Elf64_Addr phys_address_start;
 489   Elf64_Addr phys_address_end;
 490
 491   bool constructors_run_p;
 492   struct kernel_info *init_array_func, *fini_array_func;
 493
 494   /* Number of kernels in this module.  */
 495   int kernel_count;
 496   /* An array of kernel_info structures describing each kernel in this
 497      module.  */
 498   struct kernel_info kernels[];
 499 };
 500
 501 /* A linked list of memory arenas allocated on the device.
 502    These are used by OpenMP, as a means to optimize per-team malloc,
 503    and for host-accessible stack space.  */
 504
 505 struct ephemeral_memories_list
 506 {
 507   struct ephemeral_memories_list *next;
 508
 509   /* The size is determined by the number of teams and threads.  */
 510   size_t size;
 511   /* The device address allocated memory.  */
 512   void *address;
 513   /* A flag to prevent two asynchronous kernels trying to use the same memory.
 514      The mutex is locked until the kernel exits.  */
 515   pthread_mutex_t in_use;
 516 };
 517
 518 /* Information about the whole HSA environment and all of its agents.  */
 519
 520 struct hsa_context_info
 521 {
 522   /* Whether the structure has been initialized.  */
 523   bool initialized;
 524   /* Number of usable GPU HSA agents in the system.  */
 525   int agent_count;
 526   /* Array of agent_info structures describing the individual HSA agents.  */
 527   struct agent_info *agents;
 528   /* Driver version string. */
 529   char driver_version_s[30];
 530 };
 531
 532 /* }}}  */
 533 /* {{{ Global variables  */
 534
 535 /* Information about the whole HSA environment and all of its agents.  */
 536
 537 static struct hsa_context_info hsa_context;
 538
 539 /* HSA runtime functions that are initialized in init_hsa_context.  */
 540
 541 static struct hsa_runtime_fn_info hsa_fns;
 542
 543 /* Heap space, allocated target-side, provided for use of newlib malloc.
 544    Each module should have it's own heap allocated.
 545    Beware that heap usage increases with OpenMP teams.  See also arenas.  */
 546
 547 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
 548
 549 /* Ephemeral memory sizes for each kernel launch.  */
 550
 551 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
 552 static int stack_size = DEFAULT_GCN_STACK_SIZE;
 553 static int lowlat_size = -1;
 554
 555 /* Flag to decide whether print to stderr information about what is going on.
 556    Set in init_debug depending on environment variables.  */
 557
 558 static bool debug;
 559
 560 /* Flag to decide if the runtime should suppress a possible fallback to host
 561    execution.  */
 562
 563 static bool suppress_host_fallback;
 564
 565 /* Flag to locate HSA runtime shared library that is dlopened
 566    by this plug-in.  */
 567
 568 static const char *hsa_runtime_lib;
 569
 570 /* Flag to decide if the runtime should support also CPU devices (can be
 571    a simulator).  */
 572
 573 static bool support_cpu_devices;
 574
 575 /* Runtime dimension overrides.  Zero indicates default.  */
 576
 577 static int override_x_dim = 0;
 578 static int override_z_dim = 0;
 579
 580 /* }}}  */
 581 /* {{{ Debug & Diagnostic  */
 582
 583 /* Print a message to stderr if GCN_DEBUG value is set to true.  */
 584
 585 #define DEBUG_PRINT(...) \
 586   do \
 587   { \
 588     if (debug) \
 589       { \
 590         fprintf (stderr, __VA_ARGS__); \
 591       } \
 592   } \
 593   while (false);
 594
 595 /* Flush stderr if GCN_DEBUG value is set to true.  */
 596
 597 #define DEBUG_FLUSH()                           \
 598   do {                                          \
 599     if (debug)                                  \
 600       fflush (stderr);                          \
 601   } while (false)
 602
 603 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
 604    is set to true.  */
 605
 606 #define DEBUG_LOG(prefix, ...)                  \
 607   do                                            \
 608     {                                           \
 609       DEBUG_PRINT (prefix);                     \
 610       DEBUG_PRINT (__VA_ARGS__);                        \
 611       DEBUG_FLUSH ();                           \
 612     } while (false)
 613
 614 /* Print a debugging message to stderr.  */
 615
 616 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
 617
 618 /* Print a warning message to stderr.  */
 619
 620 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
 621
 622 /* Print HSA warning STR with an HSA STATUS code.  */
 623
 624 static void
 625 hsa_warn (const char *str, hsa_status_t status)
 626 {
 627   if (!debug)
 628     return;
 629
 630   const char *hsa_error_msg = "[unknown]";
 631   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 632
 633   fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
 634            hsa_error_msg);
 635 }
 636
 637 /* Report a fatal error STR together with the HSA error corresponding to STATUS
 638    and terminate execution of the current process.  */
 639
 640 static void
 641 hsa_fatal (const char *str, hsa_status_t status)
 642 {
 643   const char *hsa_error_msg = "[unknown]";
 644   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 645   GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
 646                      hsa_error_msg);
 647 }
 648
 649 /* Like hsa_fatal, except only report error message, and return FALSE
 650    for propagating error processing to outside of plugin.  */
 651
 652 static bool
 653 hsa_error (const char *str, hsa_status_t status)
 654 {
 655   const char *hsa_error_msg = "[unknown]";
 656   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 657   GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
 658                      hsa_error_msg);
 659   return false;
 660 }
 661
 662 /* Dump information about the available hardware.  */
 663
 664 static void
 665 dump_hsa_system_info (void)
 666 {
 667   hsa_status_t status;
 668
 669   hsa_endianness_t endianness;
 670   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
 671                                            &endianness);
 672   if (status == HSA_STATUS_SUCCESS)
 673     switch (endianness)
 674       {
 675       case HSA_ENDIANNESS_LITTLE:
 676         GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
 677         break;
 678       case HSA_ENDIANNESS_BIG:
 679         GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
 680         break;
 681       default:
 682         GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
 683       }
 684   else
 685     GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
 686
 687   uint8_t extensions[128];
 688   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
 689                                            &extensions);
 690   if (status == HSA_STATUS_SUCCESS)
 691     {
 692       if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
 693         GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
 694     }
 695   else
 696     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
 697 }
 698
 699 /* Dump information about the available hardware.  */
 700
 701 static void
 702 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
 703 {
 704   switch (machine_model)
 705     {
 706     case HSA_MACHINE_MODEL_SMALL:
 707       GCN_DEBUG ("%s: SMALL\n", s);
 708       break;
 709     case HSA_MACHINE_MODEL_LARGE:
 710       GCN_DEBUG ("%s: LARGE\n", s);
 711       break;
 712     default:
 713       GCN_WARNING ("%s: UNKNOWN\n", s);
 714       break;
 715     }
 716 }
 717
 718 /* Dump information about the available hardware.  */
 719
 720 static void
 721 dump_profile (hsa_profile_t profile, const char *s)
 722 {
 723   switch (profile)
 724     {
 725     case HSA_PROFILE_FULL:
 726       GCN_DEBUG ("%s: FULL\n", s);
 727       break;
 728     case HSA_PROFILE_BASE:
 729       GCN_DEBUG ("%s: BASE\n", s);
 730       break;
 731     default:
 732       GCN_WARNING ("%s: UNKNOWN\n", s);
 733       break;
 734     }
 735 }
 736
 737 /* Dump information about a device memory region.  */
 738
 739 static hsa_status_t
 740 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
 741 {
 742   hsa_status_t status;
 743
 744   hsa_region_segment_t segment;
 745   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
 746                                            &segment);
 747   if (status == HSA_STATUS_SUCCESS)
 748     {
 749       if (segment == HSA_REGION_SEGMENT_GLOBAL)
 750         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
 751       else if (segment == HSA_REGION_SEGMENT_READONLY)
 752         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
 753       else if (segment == HSA_REGION_SEGMENT_PRIVATE)
 754         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
 755       else if (segment == HSA_REGION_SEGMENT_GROUP)
 756         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
 757       else
 758         GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
 759     }
 760   else
 761     GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
 762
 763   if (segment == HSA_REGION_SEGMENT_GLOBAL)
 764     {
 765       uint32_t flags;
 766       status
 767         = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
 768                                           &flags);
 769       if (status == HSA_STATUS_SUCCESS)
 770         {
 771           if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
 772             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
 773           if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
 774             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
 775           if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
 776             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
 777         }
 778       else
 779         GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
 780     }
 781
 782   size_t size;
 783   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
 784   if (status == HSA_STATUS_SUCCESS)
 785     GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
 786   else
 787     GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
 788
 789   status
 790     = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
 791                                       &size);
 792   if (status == HSA_STATUS_SUCCESS)
 793     GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
 794   else
 795     GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
 796
 797   bool alloc_allowed;
 798   status
 799     = hsa_fns.hsa_region_get_info_fn (region,
 800                                       HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
 801                                       &alloc_allowed);
 802   if (status == HSA_STATUS_SUCCESS)
 803     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
 804   else
 805     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
 806
 807   if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
 808     return HSA_STATUS_SUCCESS;
 809
 810   status
 811     = hsa_fns.hsa_region_get_info_fn (region,
 812                                       HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
 813                                       &size);
 814   if (status == HSA_STATUS_SUCCESS)
 815     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
 816   else
 817     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
 818
 819   size_t align;
 820   status
 821     = hsa_fns.hsa_region_get_info_fn (region,
 822                                       HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
 823                                       &align);
 824   if (status == HSA_STATUS_SUCCESS)
 825     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
 826   else
 827     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
 828
 829   return HSA_STATUS_SUCCESS;
 830 }
 831
 832 /* Dump information about all the device memory regions.  */
 833
 834 static void
 835 dump_hsa_regions (hsa_agent_t agent)
 836 {
 837   hsa_status_t status;
 838   status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
 839                                                  dump_hsa_region,
 840                                                  NULL);
 841   if (status != HSA_STATUS_SUCCESS)
 842     hsa_error ("Dumping hsa regions failed", status);
 843 }
 844
 845 /* Dump information about the available devices.  */
 846
 847 static hsa_status_t
 848 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
 849 {
 850   hsa_status_t status;
 851
 852   char buf[64];
 853   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
 854                                           &buf);
 855   if (status == HSA_STATUS_SUCCESS)
 856     GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
 857   else
 858     GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
 859
 860   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
 861                                           &buf);
 862   if (status == HSA_STATUS_SUCCESS)
 863     GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
 864   else
 865     GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
 866
 867   hsa_machine_model_t machine_model;
 868   status
 869     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
 870                                      &machine_model);
 871   if (status == HSA_STATUS_SUCCESS)
 872     dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
 873   else
 874     GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
 875
 876   hsa_profile_t profile;
 877   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
 878                                           &profile);
 879   if (status == HSA_STATUS_SUCCESS)
 880     dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
 881   else
 882     GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
 883
 884   hsa_device_type_t device_type;
 885   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
 886                                           &device_type);
 887   if (status == HSA_STATUS_SUCCESS)
 888     {
 889       switch (device_type)
 890         {
 891         case HSA_DEVICE_TYPE_CPU:
 892           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
 893           break;
 894         case HSA_DEVICE_TYPE_GPU:
 895           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
 896           break;
 897         case HSA_DEVICE_TYPE_DSP:
 898           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
 899           break;
 900         default:
 901           GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
 902           break;
 903         }
 904     }
 905   else
 906     GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
 907
 908   uint32_t cu_count;
 909   status = hsa_fns.hsa_agent_get_info_fn
 910     (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
 911   if (status == HSA_STATUS_SUCCESS)
 912     GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
 913   else
 914     GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
 915
 916   uint32_t size;
 917   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
 918                                           &size);
 919   if (status == HSA_STATUS_SUCCESS)
 920     GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
 921   else
 922     GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
 923
 924   uint32_t max_dim;
 925   status = hsa_fns.hsa_agent_get_info_fn (agent,
 926                                           HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
 927                                           &max_dim);
 928   if (status == HSA_STATUS_SUCCESS)
 929     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
 930   else
 931     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
 932
 933   uint32_t max_size;
 934   status = hsa_fns.hsa_agent_get_info_fn (agent,
 935                                           HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
 936                                           &max_size);
 937   if (status == HSA_STATUS_SUCCESS)
 938     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
 939   else
 940     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
 941
 942   uint32_t grid_max_dim;
 943   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
 944                                           &grid_max_dim);
 945   if (status == HSA_STATUS_SUCCESS)
 946     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
 947   else
 948     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
 949
 950   uint32_t grid_max_size;
 951   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
 952                                           &grid_max_size);
 953   if (status == HSA_STATUS_SUCCESS)
 954     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
 955   else
 956     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
 957
 958   dump_hsa_regions (agent);
 959
 960   return HSA_STATUS_SUCCESS;
 961 }
 962
 963 /* Forward reference.  */
 964
 965 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
 966
 967 /* Helper function for dump_executable_symbols.  */
 968
 969 static hsa_status_t
 970 dump_executable_symbol (hsa_executable_t executable,
 971                         hsa_executable_symbol_t symbol,
 972                         void *data __attribute__((unused)))
 973 {
 974   char *name = get_executable_symbol_name (symbol);
 975
 976   if (name)
 977     {
 978       GCN_DEBUG ("executable symbol: %s\n", name);
 979       free (name);
 980     }
 981
 982   return HSA_STATUS_SUCCESS;
 983 }
 984
 985 /* Dump all global symbol in an executable.  */
 986
 987 static void
 988 dump_executable_symbols (hsa_executable_t executable)
 989 {
 990   hsa_status_t status;
 991   status
 992     = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
 993                                                  dump_executable_symbol,
 994                                                  NULL);
 995   if (status != HSA_STATUS_SUCCESS)
 996     hsa_fatal ("Could not dump HSA executable symbols", status);
 997 }
 998
 999 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
1000
1001 static void
1002 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1003 {
1004   struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1005
1006   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1007   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1008   fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1009   fprintf (stderr, "%*sheap address: %p\n", indent, "",
1010            (void*)kernargs->abi.heap_ptr);
1011   fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
1012            "", (void*)kernargs->abi.arena_ptr,
1013            kernargs->abi.arena_size_per_team);
1014   fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
1015            "", (void*)kernargs->abi.stack_ptr,
1016            kernargs->abi.stack_size_per_thread);
1017   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1018   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1019            dispatch->private_segment_size);
1020   fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
1021            "", dispatch->group_segment_size);
1022   fprintf (stderr, "\n");
1023 }
1024
1025 /* }}}  */
1026 /* {{{ Utility functions  */
1027
1028 /* Cast the thread local storage to gcn_thread.  */
1029
1030 static inline struct gcn_thread *
1031 gcn_thread (void)
1032 {
1033   return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1034 }
1035
1036 /* Initialize debug and suppress_host_fallback according to the environment.  */
1037
1038 static void
1039 init_environment_variables (void)
1040 {
1041   if (secure_getenv ("GCN_DEBUG"))
1042     debug = true;
1043   else
1044     debug = false;
1045
1046   if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1047     suppress_host_fallback = true;
1048   else
1049     suppress_host_fallback = false;
1050
1051   hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1052   if (hsa_runtime_lib == NULL)
1053     hsa_runtime_lib = "libhsa-runtime64.so.1";
1054
1055   support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1056
1057   const char *x = secure_getenv ("GCN_NUM_TEAMS");
1058   if (!x)
1059     x = secure_getenv ("GCN_NUM_GANGS");
1060   if (x)
1061     override_x_dim = atoi (x);
1062
1063   const char *z = secure_getenv ("GCN_NUM_THREADS");
1064   if (!z)
1065     z = secure_getenv ("GCN_NUM_WORKERS");
1066   if (z)
1067     override_z_dim = atoi (z);
1068
1069   const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1070   if (heap)
1071     {
1072       size_t tmp = atol (heap);
1073       if (tmp)
1074         gcn_kernel_heap_size = tmp;
1075     }
1076
1077   const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
1078   if (arena)
1079     {
1080       int tmp = atoi (arena);
1081       if (tmp)
1082         team_arena_size = tmp;;
1083     }
1084
1085   const char *stack = secure_getenv ("GCN_STACK_SIZE");
1086   if (stack)
1087     {
1088       int tmp = atoi (stack);
1089       if (tmp)
1090         stack_size = tmp;;
1091     }
1092
1093   const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1094   if (lowlat)
1095     lowlat_size = atoi (lowlat);
1096 }
1097
1098 /* Return malloc'd string with name of SYMBOL.  */
1099
1100 static char *
1101 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1102 {
1103   hsa_status_t status;
1104   char *res;
1105   uint32_t len;
1106   const hsa_executable_symbol_info_t info_name_length
1107     = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1108
1109   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1110                                                       &len);
1111   if (status != HSA_STATUS_SUCCESS)
1112     {
1113       hsa_error ("Could not get length of symbol name", status);
1114       return NULL;
1115     }
1116
1117   res = GOMP_PLUGIN_malloc (len + 1);
1118
1119   const hsa_executable_symbol_info_t info_name
1120     = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1121
1122   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1123
1124   if (status != HSA_STATUS_SUCCESS)
1125     {
1126       hsa_error ("Could not get symbol name", status);
1127       free (res);
1128       return NULL;
1129     }
1130
1131   res[len] = '\0';
1132
1133   return res;
1134 }
1135
1136 /* Get the number of GPU Compute Units.  */
1137
1138 static int
1139 get_cu_count (struct agent_info *agent)
1140 {
1141   uint32_t cu_count;
1142   hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1143     (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1144   if (status == HSA_STATUS_SUCCESS)
1145     return cu_count;
1146   else
1147     return 64;  /* The usual number for older devices.  */
1148 }
1149
1150 /* Calculate the maximum grid size for OMP threads / OACC workers.
1151    This depends on the kernel's resource usage levels.  */
1152
1153 static int
1154 limit_worker_threads (int threads)
1155 {
1156   /* FIXME Do something more inteligent here.
1157      GCN can always run 4 threads within a Compute Unit, but
1158      more than that depends on register usage.  */
1159   if (threads > 16)
1160     threads = 16;
1161   return threads;
1162 }
1163
1164 /* This sets the maximum number of teams to twice the number of GPU Compute
1165    Units to avoid memory waste and corresponding memory access faults.  */
1166
1167 static int
1168 limit_teams (int teams, struct agent_info *agent)
1169 {
1170   int max_teams = 2 * get_cu_count (agent);
1171   if (teams > max_teams)
1172     teams = max_teams;
1173   return teams;
1174 }
1175
1176 /* Parse the target attributes INPUT provided by the compiler and return true
1177    if we should run anything all.  If INPUT is NULL, fill DEF with default
1178    values, then store INPUT or DEF into *RESULT.
1179
1180    This is used for OpenMP only.  */
1181
1182 static bool
1183 parse_target_attributes (void **input,
1184                          struct GOMP_kernel_launch_attributes *def,
1185                          struct GOMP_kernel_launch_attributes **result,
1186                          struct agent_info *agent)
1187 {
1188   if (!input)
1189     GOMP_PLUGIN_fatal ("No target arguments provided");
1190
1191   bool grid_attrs_found = false;
1192   bool gcn_dims_found = false;
1193   int gcn_teams = 0;
1194   int gcn_threads = 0;
1195   while (*input)
1196     {
1197       intptr_t id = (intptr_t) *input++, val;
1198
1199       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1200         val = (intptr_t) *input++;
1201       else
1202         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1203
1204       val = (val > INT_MAX) ? INT_MAX : val;
1205
1206       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1207           && ((id & GOMP_TARGET_ARG_ID_MASK)
1208               == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1209         {
1210           grid_attrs_found = true;
1211           break;
1212         }
1213       else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1214                == GOMP_TARGET_ARG_DEVICE_ALL)
1215         {
1216           gcn_dims_found = true;
1217           switch (id & GOMP_TARGET_ARG_ID_MASK)
1218             {
1219             case GOMP_TARGET_ARG_NUM_TEAMS:
1220               gcn_teams = limit_teams (val, agent);
1221               break;
1222             case GOMP_TARGET_ARG_THREAD_LIMIT:
1223               gcn_threads = limit_worker_threads (val);
1224               break;
1225             default:
1226               ;
1227             }
1228         }
1229     }
1230
1231   if (gcn_dims_found)
1232     {
1233       bool gfx900_workaround_p = false;
1234
1235       if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1236           && gcn_threads == 0 && override_z_dim == 0)
1237         {
1238           gfx900_workaround_p = true;
1239           GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1240                        "threads to at most 4 per team.\n");
1241           GCN_WARNING (" - If this is not a Vega 10 device, please use "
1242                        "GCN_NUM_THREADS=16\n");
1243         }
1244
1245       /* Ideally, when a dimension isn't explicitly specified, we should
1246          tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1247          In practice, we tune for peak performance on BabelStream, which
1248          for OpenACC is currently 32 threads per CU.  */
1249       def->ndim = 3;
1250       if (gcn_teams <= 0 && gcn_threads <= 0)
1251         {
1252           /* Set up a reasonable number of teams and threads.  */
1253           gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1254           def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1255           def->gdims[2] = gcn_threads;
1256         }
1257       else if (gcn_teams <= 0 && gcn_threads > 0)
1258         {
1259           /* Auto-scale the number of teams with the number of threads.  */
1260           def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1261           def->gdims[2] = gcn_threads;
1262         }
1263       else if (gcn_teams > 0 && gcn_threads <= 0)
1264         {
1265           int max_threads = gfx900_workaround_p ? 4 : 16;
1266
1267           /* Auto-scale the number of threads with the number of teams.  */
1268           def->gdims[0] = gcn_teams;
1269           def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1270           if (def->gdims[2] == 0)
1271             def->gdims[2] = 1;
1272           else if (def->gdims[2] > max_threads)
1273             def->gdims[2] = max_threads;
1274         }
1275       else
1276         {
1277           def->gdims[0] = gcn_teams;
1278           def->gdims[2] = gcn_threads;
1279         }
1280       def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
1281       def->wdims[0] = 1;  /* Single team per work-group.  */
1282       def->wdims[1] = 64;
1283       def->wdims[2] = 16;
1284       *result = def;
1285       return true;
1286     }
1287   else if (!grid_attrs_found)
1288     {
1289       def->ndim = 1;
1290       def->gdims[0] = 1;
1291       def->gdims[1] = 1;
1292       def->gdims[2] = 1;
1293       def->wdims[0] = 1;
1294       def->wdims[1] = 1;
1295       def->wdims[2] = 1;
1296       *result = def;
1297       GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1298       return true;
1299     }
1300
1301   struct GOMP_kernel_launch_attributes *kla;
1302   kla = (struct GOMP_kernel_launch_attributes *) *input;
1303   *result = kla;
1304   if (kla->ndim == 0 || kla->ndim > 3)
1305     GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1306
1307   GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1308   unsigned i;
1309   for (i = 0; i < kla->ndim; i++)
1310     {
1311       GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
1312                  kla->gdims[i], kla->wdims[i]);
1313       if (kla->gdims[i] == 0)
1314         return false;
1315     }
1316   return true;
1317 }
1318
1319 /* Return the group size given the requested GROUP size, GRID size and number
1320    of grid dimensions NDIM.  */
1321
1322 static uint32_t
1323 get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1324 {
1325   if (group == 0)
1326     {
1327       /* TODO: Provide a default via environment or device characteristics.  */
1328       if (ndim == 1)
1329         group = 64;
1330       else if (ndim == 2)
1331         group = 8;
1332       else
1333         group = 4;
1334     }
1335
1336   if (group > grid)
1337     group = grid;
1338   return group;
1339 }
1340
1341 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
1342
1343 static void
1344 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1345 {
1346   __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1347 }
1348
1349 /* A never-called callback for the HSA command queues.  These signal events
1350    that we don't use, so we trigger an error.
1351
1352    This "queue" is not to be confused with the async queues, below.  */
1353
1354 static void
1355 hsa_queue_callback (hsa_status_t status,
1356                 hsa_queue_t *queue __attribute__ ((unused)),
1357                 void *data __attribute__ ((unused)))
1358 {
1359   hsa_fatal ("Asynchronous queue error", status);
1360 }
1361
1362 /* }}}  */
1363 /* {{{ HSA initialization  */
1364
1365 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
1366
1367 static bool
1368 init_hsa_runtime_functions (void)
1369 {
1370 #define DLSYM_FN(function) \
1371   hsa_fns.function##_fn = dlsym (handle, #function); \
1372   if (hsa_fns.function##_fn == NULL) \
1373     return false;
1374   void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1375   if (handle == NULL)
1376     return false;
1377
1378   DLSYM_FN (hsa_status_string)
1379   DLSYM_FN (hsa_system_get_info)
1380   DLSYM_FN (hsa_agent_get_info)
1381   DLSYM_FN (hsa_init)
1382   DLSYM_FN (hsa_iterate_agents)
1383   DLSYM_FN (hsa_region_get_info)
1384   DLSYM_FN (hsa_queue_create)
1385   DLSYM_FN (hsa_agent_iterate_regions)
1386   DLSYM_FN (hsa_executable_destroy)
1387   DLSYM_FN (hsa_executable_create)
1388   DLSYM_FN (hsa_executable_global_variable_define)
1389   DLSYM_FN (hsa_executable_load_code_object)
1390   DLSYM_FN (hsa_executable_freeze)
1391   DLSYM_FN (hsa_signal_create)
1392   DLSYM_FN (hsa_memory_allocate)
1393   DLSYM_FN (hsa_memory_assign_agent)
1394   DLSYM_FN (hsa_memory_copy)
1395   DLSYM_FN (hsa_memory_free)
1396   DLSYM_FN (hsa_signal_destroy)
1397   DLSYM_FN (hsa_executable_get_symbol)
1398   DLSYM_FN (hsa_executable_symbol_get_info)
1399   DLSYM_FN (hsa_executable_iterate_symbols)
1400   DLSYM_FN (hsa_queue_add_write_index_release)
1401   DLSYM_FN (hsa_queue_load_read_index_acquire)
1402   DLSYM_FN (hsa_signal_wait_acquire)
1403   DLSYM_FN (hsa_signal_store_relaxed)
1404   DLSYM_FN (hsa_signal_store_release)
1405   DLSYM_FN (hsa_signal_load_acquire)
1406   DLSYM_FN (hsa_queue_destroy)
1407   DLSYM_FN (hsa_code_object_deserialize)
1408   return true;
1409 #undef DLSYM_FN
1410 }
1411
1412 /* Return true if the agent is a GPU and can accept of concurrent submissions
1413    from different threads.  */
1414
1415 static bool
1416 suitable_hsa_agent_p (hsa_agent_t agent)
1417 {
1418   hsa_device_type_t device_type;
1419   hsa_status_t status
1420     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1421                                      &device_type);
1422   if (status != HSA_STATUS_SUCCESS)
1423     return false;
1424
1425   switch (device_type)
1426     {
1427     case HSA_DEVICE_TYPE_GPU:
1428       break;
1429     case HSA_DEVICE_TYPE_CPU:
1430       if (!support_cpu_devices)
1431         return false;
1432       break;
1433     default:
1434       return false;
1435     }
1436
1437   uint32_t features = 0;
1438   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1439                                           &features);
1440   if (status != HSA_STATUS_SUCCESS
1441       || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1442     return false;
1443   hsa_queue_type_t queue_type;
1444   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1445                                           &queue_type);
1446   if (status != HSA_STATUS_SUCCESS
1447       || (queue_type != HSA_QUEUE_TYPE_MULTI))
1448     return false;
1449
1450   return true;
1451 }
1452
1453 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1454    agent_count in hsa_context.  */
1455
1456 static hsa_status_t
1457 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1458 {
1459   if (suitable_hsa_agent_p (agent))
1460     hsa_context.agent_count++;
1461   return HSA_STATUS_SUCCESS;
1462 }
1463
1464 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1465    id to the describing structure in the hsa context.  The index of the
1466    structure is pointed to by DATA, increment it afterwards.  */
1467
1468 static hsa_status_t
1469 assign_agent_ids (hsa_agent_t agent, void *data)
1470 {
1471   if (suitable_hsa_agent_p (agent))
1472     {
1473       int *agent_index = (int *) data;
1474       hsa_context.agents[*agent_index].id = agent;
1475       ++*agent_index;
1476     }
1477   return HSA_STATUS_SUCCESS;
1478 }
1479
1480 /* Initialize hsa_context if it has not already been done.
1481    Return TRUE on success.  */
1482
1483 static bool
1484 init_hsa_context (void)
1485 {
1486   hsa_status_t status;
1487   int agent_index = 0;
1488
1489   if (hsa_context.initialized)
1490     return true;
1491   init_environment_variables ();
1492   if (!init_hsa_runtime_functions ())
1493     {
1494       GCN_WARNING ("Run-time could not be dynamically opened\n");
1495       if (suppress_host_fallback)
1496         GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1497       return false;
1498     }
1499   status = hsa_fns.hsa_init_fn ();
1500   if (status != HSA_STATUS_SUCCESS)
1501     return hsa_error ("Run-time could not be initialized", status);
1502   GCN_DEBUG ("HSA run-time initialized for GCN\n");
1503
1504   if (debug)
1505     dump_hsa_system_info ();
1506
1507   status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1508   if (status != HSA_STATUS_SUCCESS)
1509     return hsa_error ("GCN GPU devices could not be enumerated", status);
1510   GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1511
1512   hsa_context.agents
1513     = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1514                                   * sizeof (struct agent_info));
1515   status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1516   if (status != HSA_STATUS_SUCCESS)
1517     return hsa_error ("Scanning compute agents failed", status);
1518   if (agent_index != hsa_context.agent_count)
1519     {
1520       GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1521       return false;
1522     }
1523
1524   if (debug)
1525     {
1526       status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1527       if (status != HSA_STATUS_SUCCESS)
1528         GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1529     }
1530
1531   uint16_t minor, major;
1532   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1533                                            &minor);
1534   if (status != HSA_STATUS_SUCCESS)
1535     GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1536   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1537                                            &major);
1538   if (status != HSA_STATUS_SUCCESS)
1539     GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1540
1541   size_t len = sizeof hsa_context.driver_version_s;
1542   int printed = snprintf (hsa_context.driver_version_s, len,
1543                           "HSA Runtime %hu.%hu", (unsigned short int)major,
1544                           (unsigned short int)minor);
1545   if (printed >= len)
1546     GCN_WARNING ("HSA runtime version string was truncated."
1547                  "Version %hu.%hu is too long.", (unsigned short int)major,
1548                  (unsigned short int)minor);
1549
1550   hsa_context.initialized = true;
1551   return true;
1552 }
1553
1554 /* Verify that hsa_context has already been initialized and return the
1555    agent_info structure describing device number N.  Return NULL on error.  */
1556
1557 static struct agent_info *
1558 get_agent_info (int n)
1559 {
1560   if (!hsa_context.initialized)
1561     {
1562       GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1563       return NULL;
1564     }
1565   if (n >= hsa_context.agent_count)
1566     {
1567       GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1568       return NULL;
1569     }
1570   if (!hsa_context.agents[n].initialized)
1571     {
1572       GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1573       return NULL;
1574     }
1575   return &hsa_context.agents[n];
1576 }
1577
1578 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1579
1580    Selects (breaks at) a suitable region of type KIND.  */
1581
1582 static hsa_status_t
1583 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1584                    hsa_region_global_flag_t kind)
1585 {
1586   hsa_status_t status;
1587   hsa_region_segment_t segment;
1588
1589   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1590                                            &segment);
1591   if (status != HSA_STATUS_SUCCESS)
1592     return status;
1593   if (segment != HSA_REGION_SEGMENT_GLOBAL)
1594     return HSA_STATUS_SUCCESS;
1595
1596   uint32_t flags;
1597   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1598                                            &flags);
1599   if (status != HSA_STATUS_SUCCESS)
1600     return status;
1601   if (flags & kind)
1602     {
1603       *retval = region;
1604       return HSA_STATUS_INFO_BREAK;
1605     }
1606   return HSA_STATUS_SUCCESS;
1607 }
1608
1609 /* Callback of hsa_agent_iterate_regions.
1610
1611    Selects a kernargs memory region.  */
1612
1613 static hsa_status_t
1614 get_kernarg_memory_region (hsa_region_t region, void *data)
1615 {
1616   return get_memory_region (region, (hsa_region_t *)data,
1617                             HSA_REGION_GLOBAL_FLAG_KERNARG);
1618 }
1619
1620 /* Callback of hsa_agent_iterate_regions.
1621
1622    Selects a coarse-grained memory region suitable for the heap and
1623    offload data.  */
1624
1625 static hsa_status_t
1626 get_data_memory_region (hsa_region_t region, void *data)
1627 {
1628   return get_memory_region (region, (hsa_region_t *)data,
1629                             HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1630 }
1631
1632 static int
1633 elf_gcn_isa_field (Elf64_Ehdr *image)
1634 {
1635   return image->e_flags & EF_AMDGPU_MACH_MASK;
1636 }
1637
1638 const static char *gcn_gfx803_s = "gfx803";
1639 const static char *gcn_gfx900_s = "gfx900";
1640 const static char *gcn_gfx906_s = "gfx906";
1641 const static char *gcn_gfx908_s = "gfx908";
1642 const static char *gcn_gfx90a_s = "gfx90a";
1643 const static char *gcn_gfx1030_s = "gfx1030";
1644 const static int gcn_isa_name_len = 6;
1645
1646 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1647    support the ISA. */
1648
1649 static const char*
1650 isa_hsa_name (int isa) {
1651   switch(isa)
1652     {
1653     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1654       return gcn_gfx803_s;
1655     case EF_AMDGPU_MACH_AMDGCN_GFX900:
1656       return gcn_gfx900_s;
1657     case EF_AMDGPU_MACH_AMDGCN_GFX906:
1658       return gcn_gfx906_s;
1659     case EF_AMDGPU_MACH_AMDGCN_GFX908:
1660       return gcn_gfx908_s;
1661     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1662       return gcn_gfx90a_s;
1663     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1664       return gcn_gfx1030_s;
1665     }
1666   return NULL;
1667 }
1668
1669 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1670    with -march) or NULL if we do not support the ISA.
1671    Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */
1672
1673 static const char*
1674 isa_gcc_name (int isa) {
1675   switch(isa)
1676     {
1677     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1678       return "fiji";
1679     default:
1680       return isa_hsa_name (isa);
1681     }
1682 }
1683
1684 /* Returns the code which is used in the GCN object code to identify the ISA with
1685    the given name (as used by the HSA runtime).  */
1686
1687 static gcn_isa
1688 isa_code(const char *isa) {
1689   if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1690     return EF_AMDGPU_MACH_AMDGCN_GFX803;
1691
1692   if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1693     return EF_AMDGPU_MACH_AMDGCN_GFX900;
1694
1695   if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1696     return EF_AMDGPU_MACH_AMDGCN_GFX906;
1697
1698   if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1699     return EF_AMDGPU_MACH_AMDGCN_GFX908;
1700
1701   if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1702     return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1703
1704   if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
1705     return EF_AMDGPU_MACH_AMDGCN_GFX1030;
1706
1707   return -1;
1708 }
1709
1710 /* CDNA2 devices have twice as many VGPRs compared to older devices.  */
1711
1712 static int
1713 max_isa_vgprs (int isa)
1714 {
1715   switch (isa)
1716     {
1717     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1718     case EF_AMDGPU_MACH_AMDGCN_GFX900:
1719     case EF_AMDGPU_MACH_AMDGCN_GFX906:
1720     case EF_AMDGPU_MACH_AMDGCN_GFX908:
1721     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1722       return 256;
1723     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1724       return 512;
1725     }
1726   GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1727 }
1728
1729 /* }}}  */
1730 /* {{{ Run  */
1731
1732 /* Create or reuse a team arena and stack space.
1733
1734    Team arenas are used by OpenMP to avoid calling malloc multiple times
1735    while setting up each team.  This is purely a performance optimization.
1736
1737    The stack space is used by all kernels.  We must allocate it in such a
1738    way that the reverse offload implmentation can access the data.
1739
1740    Allocating this memory costs performance, so this function will reuse an
1741    existing allocation if a large enough one is idle.
1742    The memory lock is released, but not deallocated, when the kernel exits.  */
1743
1744 static void
1745 configure_ephemeral_memories (struct kernel_info *kernel,
1746                               struct kernargs_abi *kernargs, int num_teams,
1747                               int num_threads)
1748 {
1749   struct agent_info *agent = kernel->agent;
1750   struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1751   struct ephemeral_memories_list *item;
1752
1753   int actual_arena_size = (kernel->kind == KIND_OPENMP
1754                            ? team_arena_size : 0);
1755   int actual_arena_total_size = actual_arena_size * num_teams;
1756   size_t size = (actual_arena_total_size
1757                  + num_teams * num_threads * stack_size);
1758
1759   for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1760     {
1761       if (item->size < size)
1762         continue;
1763
1764       if (pthread_mutex_trylock (&item->in_use) == 0)
1765         break;
1766     }
1767
1768   if (!item)
1769     {
1770       GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1771                  " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1772                  num_teams, num_threads, size);
1773
1774       if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1775         {
1776           GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1777           return;
1778         }
1779       item = malloc (sizeof (*item));
1780       item->size = size;
1781       item->next = NULL;
1782       *next_ptr = item;
1783
1784       if (pthread_mutex_init (&item->in_use, NULL))
1785         {
1786           GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1787           return;
1788         }
1789       if (pthread_mutex_lock (&item->in_use))
1790         {
1791           GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1792           return;
1793         }
1794       if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1795         {
1796           GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1797           return;
1798         }
1799
1800       hsa_status_t status;
1801       status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1802                                                &item->address);
1803       if (status != HSA_STATUS_SUCCESS)
1804         hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1805       status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1806                                                    HSA_ACCESS_PERMISSION_RW);
1807       if (status != HSA_STATUS_SUCCESS)
1808         hsa_fatal ("Could not assign arena & stack memory to device", status);
1809     }
1810
1811   kernargs->arena_ptr = (actual_arena_total_size
1812                          ? (uint64_t)item->address
1813                          : 0);
1814   kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1815   kernargs->arena_size_per_team = actual_arena_size;
1816   kernargs->stack_size_per_thread = stack_size;
1817 }
1818
1819 /* Mark an ephemeral memory space available for reuse.  */
1820
1821 static void
1822 release_ephemeral_memories (struct agent_info* agent, void *address)
1823 {
1824   struct ephemeral_memories_list *item;
1825
1826   for (item = agent->ephemeral_memories_list; item; item = item->next)
1827     {
1828       if (item->address == address)
1829         {
1830           if (pthread_mutex_unlock (&item->in_use))
1831             GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1832           return;
1833         }
1834     }
1835   GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1836 }
1837
1838 /* Clean up all the allocated team arenas.  */
1839
1840 static bool
1841 destroy_ephemeral_memories (struct agent_info *agent)
1842 {
1843   struct ephemeral_memories_list *item, *next;
1844
1845   for (item = agent->ephemeral_memories_list; item; item = next)
1846     {
1847       next = item->next;
1848       hsa_fns.hsa_memory_free_fn (item->address);
1849       if (pthread_mutex_destroy (&item->in_use))
1850         {
1851           GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1852           return false;
1853         }
1854       free (item);
1855     }
1856   agent->ephemeral_memories_list = NULL;
1857
1858   return true;
1859 }
1860
1861 /* Allocate memory on a specified device.  */
1862
1863 static void *
1864 alloc_by_agent (struct agent_info *agent, size_t size)
1865 {
1866   GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1867
1868   void *ptr;
1869   hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1870                                                         size, &ptr);
1871   if (status != HSA_STATUS_SUCCESS)
1872     {
1873       hsa_error ("Could not allocate device memory", status);
1874       return NULL;
1875     }
1876
1877   status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1878                                                HSA_ACCESS_PERMISSION_RW);
1879   if (status != HSA_STATUS_SUCCESS)
1880     {
1881       hsa_error ("Could not assign data memory to device", status);
1882       return NULL;
1883     }
1884
1885   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1886   bool profiling_dispatch_p
1887     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1888   if (profiling_dispatch_p)
1889     {
1890       acc_prof_info *prof_info = thr->prof_info;
1891       acc_event_info data_event_info;
1892       acc_api_info *api_info = thr->api_info;
1893
1894       prof_info->event_type = acc_ev_alloc;
1895
1896       data_event_info.data_event.event_type = prof_info->event_type;
1897       data_event_info.data_event.valid_bytes
1898         = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1899       data_event_info.data_event.parent_construct
1900         = acc_construct_parallel;
1901       data_event_info.data_event.implicit = 1;
1902       data_event_info.data_event.tool_info = NULL;
1903       data_event_info.data_event.var_name = NULL;
1904       data_event_info.data_event.bytes = size;
1905       data_event_info.data_event.host_ptr = NULL;
1906       data_event_info.data_event.device_ptr = (void *) ptr;
1907
1908       api_info->device_api = acc_device_api_other;
1909
1910       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1911                                             api_info);
1912     }
1913
1914   return ptr;
1915 }
1916
1917 /* Create kernel dispatch data structure for given KERNEL, along with
1918    the necessary device signals and memory allocations.  */
1919
1920 static struct kernel_dispatch *
1921 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1922                         int num_threads)
1923 {
1924   struct agent_info *agent = kernel->agent;
1925   struct kernel_dispatch *shadow
1926     = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1927
1928   shadow->agent = kernel->agent;
1929   shadow->object = kernel->object;
1930
1931   hsa_signal_t sync_signal;
1932   hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1933   if (status != HSA_STATUS_SUCCESS)
1934     hsa_fatal ("Error creating the GCN sync signal", status);
1935
1936   shadow->signal = sync_signal.handle;
1937   shadow->private_segment_size = kernel->private_segment_size;
1938
1939   if (lowlat_size < 0)
1940     {
1941       /* Divide the LDS between the number of running teams.
1942          Allocate not less than is defined in the kernel metadata.  */
1943       int teams_per_cu = num_teams / get_cu_count (agent);
1944       int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
1945       shadow->group_segment_size
1946         = (kernel->group_segment_size > LDS_per_team
1947            ? kernel->group_segment_size
1948            : LDS_per_team);;
1949     }
1950   else if (lowlat_size < GCN_LOWLAT_HEAP+8)
1951     /* Ensure that there's space for the OpenMP libgomp data.  */
1952     shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
1953   else
1954     shadow->group_segment_size = (lowlat_size > 65536
1955                                   ? 65536
1956                                   : lowlat_size);
1957
1958   /* We expect kernels to request a single pointer, explicitly, and the
1959      rest of struct kernargs, implicitly.  If they request anything else
1960      then something is wrong.  */
1961   if (kernel->kernarg_segment_size > 8)
1962     {
1963       GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1964       return NULL;
1965     }
1966
1967   status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1968                                            sizeof (struct kernargs),
1969                                            &shadow->kernarg_address);
1970   if (status != HSA_STATUS_SUCCESS)
1971     hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1972   struct kernargs *kernargs = shadow->kernarg_address;
1973
1974   /* Zero-initialize the output_data (minimum needed).  */
1975   kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
1976   kernargs->output_data.next_output = 0;
1977   for (unsigned i = 0;
1978        i < (sizeof (kernargs->output_data.queue)
1979             / sizeof (kernargs->output_data.queue[0]));
1980        i++)
1981     kernargs->output_data.queue[i].written = 0;
1982   kernargs->output_data.consumed = 0;
1983
1984   /* Pass in the heap location.  */
1985   kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
1986
1987   /* Create the ephemeral memory spaces.  */
1988   configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
1989
1990   /* Ensure we can recognize unset return values.  */
1991   kernargs->output_data.return_value = 0xcafe0000;
1992
1993   return shadow;
1994 }
1995
1996 static void
1997 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
1998                          uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
1999 {
2000   int dev_num = dev_num64;
2001   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
2002                           NULL);
2003 }
2004
2005 /* Output any data written to console output from the kernel.  It is expected
2006    that this function is polled during kernel execution.
2007
2008    We print all entries from the last item printed to the next entry without
2009    a "written" flag.  If the "final" flag is set then it'll continue right to
2010    the end.
2011
2012    The print buffer is circular, but the from and to locations don't wrap when
2013    the buffer does, so the output limit is UINT_MAX.  The target blocks on
2014    output when the buffer is full.  */
2015
2016 static void
2017 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2018                 bool final)
2019 {
2020   unsigned int limit = (sizeof (kernargs->output_data.queue)
2021                         / sizeof (kernargs->output_data.queue[0]));
2022
2023   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2024                                        __ATOMIC_ACQUIRE);
2025   unsigned int to = kernargs->output_data.next_output;
2026
2027   if (from > to)
2028     {
2029       /* Overflow.  */
2030       if (final)
2031         printf ("GCN print buffer overflowed.\n");
2032       return;
2033     }
2034
2035   unsigned int i;
2036   for (i = from; i < to; i++)
2037     {
2038       struct printf_data *data = &kernargs->output_data.queue[i%limit];
2039
2040       if (!data->written && !final)
2041         break;
2042
2043       switch (data->type)
2044         {
2045         case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2046         case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2047         case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2048         case 3: printf ("%.128s%.128s", data->msg, data->text); break;
2049         case 4:
2050           process_reverse_offload (data->value_u64[0], data->value_u64[1],
2051                                    data->value_u64[2], data->value_u64[3],
2052                                    data->value_u64[4], data->value_u64[5]);
2053           break;
2054         default: printf ("GCN print buffer error!\n"); break;
2055         }
2056       data->written = 0;
2057       __atomic_store_n (&kernargs->output_data.consumed, i+1,
2058                         __ATOMIC_RELEASE);
2059     }
2060   fflush (stdout);
2061 }
2062
2063 /* Release data structure created for a kernel dispatch in SHADOW argument,
2064    and clean up the signal and memory allocations.  */
2065
2066 static void
2067 release_kernel_dispatch (struct kernel_dispatch *shadow)
2068 {
2069   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2070
2071   struct kernargs *kernargs = shadow->kernarg_address;
2072   void *addr = (void *)kernargs->abi.arena_ptr;
2073   if (!addr)
2074     addr = (void *)kernargs->abi.stack_ptr;
2075   release_ephemeral_memories (shadow->agent, addr);
2076
2077   hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2078
2079   hsa_signal_t s;
2080   s.handle = shadow->signal;
2081   hsa_fns.hsa_signal_destroy_fn (s);
2082
2083   free (shadow);
2084 }
2085
2086 /* Extract the properties from a kernel binary.  */
2087
2088 static void
2089 init_kernel_properties (struct kernel_info *kernel)
2090 {
2091   hsa_status_t status;
2092   struct agent_info *agent = kernel->agent;
2093   hsa_executable_symbol_t kernel_symbol;
2094   char *buf = alloca (strlen (kernel->name) + 4);
2095   sprintf (buf, "%s.kd", kernel->name);
2096   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2097                                                  buf, agent->id,
2098                                                  0, &kernel_symbol);
2099   if (status != HSA_STATUS_SUCCESS)
2100     {
2101       hsa_warn ("Could not find symbol for kernel in the code object", status);
2102       fprintf (stderr, "not found name: '%s'\n", buf);
2103       dump_executable_symbols (agent->executable);
2104       goto failure;
2105     }
2106   GCN_DEBUG ("Located kernel %s\n", kernel->name);
2107   status = hsa_fns.hsa_executable_symbol_get_info_fn
2108     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2109   if (status != HSA_STATUS_SUCCESS)
2110     hsa_fatal ("Could not extract a kernel object from its symbol", status);
2111   status = hsa_fns.hsa_executable_symbol_get_info_fn
2112     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2113      &kernel->kernarg_segment_size);
2114   if (status != HSA_STATUS_SUCCESS)
2115     hsa_fatal ("Could not get info about kernel argument size", status);
2116   status = hsa_fns.hsa_executable_symbol_get_info_fn
2117     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2118      &kernel->group_segment_size);
2119   if (status != HSA_STATUS_SUCCESS)
2120     hsa_fatal ("Could not get info about kernel group segment size", status);
2121   status = hsa_fns.hsa_executable_symbol_get_info_fn
2122     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2123      &kernel->private_segment_size);
2124   if (status != HSA_STATUS_SUCCESS)
2125     hsa_fatal ("Could not get info about kernel private segment size",
2126                status);
2127
2128   /* The kernel type is not known until something tries to launch it.  */
2129   kernel->kind = KIND_UNKNOWN;
2130
2131   GCN_DEBUG ("Kernel structure for %s fully initialized with "
2132              "following segment sizes: \n", kernel->name);
2133   GCN_DEBUG ("  group_segment_size: %u\n",
2134              (unsigned) kernel->group_segment_size);
2135   GCN_DEBUG ("  private_segment_size: %u\n",
2136              (unsigned) kernel->private_segment_size);
2137   GCN_DEBUG ("  kernarg_segment_size: %u\n",
2138              (unsigned) kernel->kernarg_segment_size);
2139   return;
2140
2141 failure:
2142   kernel->initialization_failed = true;
2143 }
2144
2145 /* Do all the work that is necessary before running KERNEL for the first time.
2146    The function assumes the program has been created, finalized and frozen by
2147    create_and_finalize_hsa_program.  */
2148
2149 static void
2150 init_kernel (struct kernel_info *kernel)
2151 {
2152   if (pthread_mutex_lock (&kernel->init_mutex))
2153     GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2154   if (kernel->initialized)
2155     {
2156       if (pthread_mutex_unlock (&kernel->init_mutex))
2157         GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2158                            "mutex");
2159
2160       return;
2161     }
2162
2163   init_kernel_properties (kernel);
2164
2165   if (!kernel->initialization_failed)
2166     {
2167       GCN_DEBUG ("\n");
2168
2169       kernel->initialized = true;
2170     }
2171   if (pthread_mutex_unlock (&kernel->init_mutex))
2172     GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2173                        "mutex");
2174 }
2175
2176 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2177    launch attributes from KLA.
2178
2179    MODULE_LOCKED indicates that the caller already holds the lock and
2180    run_kernel need not lock it again.
2181    If AQ is NULL then agent->sync_queue will be used.  */
2182
2183 static void
2184 run_kernel (struct kernel_info *kernel, void *vars,
2185             struct GOMP_kernel_launch_attributes *kla,
2186             struct goacc_asyncqueue *aq, bool module_locked)
2187 {
2188   struct agent_info *agent = kernel->agent;
2189   GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2190              kernel->description->vpgr_count);
2191
2192   /* Reduce the number of threads/workers if there are insufficient
2193      VGPRs available to run the kernels together.  */
2194   if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2195     {
2196       int max_vgprs = max_isa_vgprs (agent->device_isa);
2197       int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2198       int max_threads = (max_vgprs / granulated_vgprs) * 4;
2199       if (kla->gdims[2] > max_threads)
2200         {
2201           GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2202                        " per team/gang - reducing to %d threads/workers.\n",
2203                        kla->gdims[2], max_threads);
2204           kla->gdims[2] = max_threads;
2205         }
2206     }
2207
2208   GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2209              (aq ? aq->id : 0));
2210   GCN_DEBUG ("GCN launch attribs: gdims:[");
2211   int i;
2212   for (i = 0; i < kla->ndim; ++i)
2213     {
2214       if (i)
2215         DEBUG_PRINT (", ");
2216       DEBUG_PRINT ("%u", kla->gdims[i]);
2217     }
2218   DEBUG_PRINT ("], normalized gdims:[");
2219   for (i = 0; i < kla->ndim; ++i)
2220     {
2221       if (i)
2222         DEBUG_PRINT (", ");
2223       DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2224     }
2225   DEBUG_PRINT ("], wdims:[");
2226   for (i = 0; i < kla->ndim; ++i)
2227     {
2228       if (i)
2229         DEBUG_PRINT (", ");
2230       DEBUG_PRINT ("%u", kla->wdims[i]);
2231     }
2232   DEBUG_PRINT ("]\n");
2233   DEBUG_FLUSH ();
2234
2235   if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2236     GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2237
2238   if (!agent->initialized)
2239     GOMP_PLUGIN_fatal ("Agent must be initialized");
2240
2241   if (!kernel->initialized)
2242     GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2243
2244   hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2245
2246   uint64_t index
2247     = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2248   GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2249
2250   /* Wait until the queue is not full before writing the packet.   */
2251   while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2252          >= command_q->size)
2253     ;
2254
2255   /* Do not allow the dimensions to be overridden when running
2256      constructors or destructors.  */
2257   int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2258   int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2259
2260   hsa_kernel_dispatch_packet_t *packet;
2261   packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2262            + index % command_q->size;
2263
2264   memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2265   packet->grid_size_x = override_x ? : kla->gdims[0];
2266   packet->workgroup_size_x = get_group_size (kla->ndim,
2267                                              packet->grid_size_x,
2268                                              kla->wdims[0]);
2269
2270   if (kla->ndim >= 2)
2271     {
2272       packet->grid_size_y = kla->gdims[1];
2273       packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2274                                                  kla->wdims[1]);
2275     }
2276   else
2277     {
2278       packet->grid_size_y = 1;
2279       packet->workgroup_size_y = 1;
2280     }
2281
2282   if (kla->ndim == 3)
2283     {
2284       packet->grid_size_z = limit_worker_threads (override_z
2285                                                   ? : kla->gdims[2]);
2286       packet->workgroup_size_z = get_group_size (kla->ndim,
2287                                                  packet->grid_size_z,
2288                                                  kla->wdims[2]);
2289     }
2290   else
2291     {
2292       packet->grid_size_z = 1;
2293       packet->workgroup_size_z = 1;
2294     }
2295
2296   GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2297              " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2298              packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2299              packet->grid_size_x / packet->workgroup_size_x,
2300              packet->grid_size_y / packet->workgroup_size_y,
2301              packet->grid_size_z / packet->workgroup_size_z,
2302              packet->workgroup_size_x, packet->workgroup_size_y,
2303              packet->workgroup_size_z);
2304
2305   struct kernel_dispatch *shadow
2306     = create_kernel_dispatch (kernel, packet->grid_size_x,
2307                               packet->grid_size_z);
2308   shadow->queue = command_q;
2309
2310   if (debug)
2311     {
2312       fprintf (stderr, "\nKernel has following dependencies:\n");
2313       print_kernel_dispatch (shadow, 2);
2314     }
2315
2316   packet->private_segment_size = shadow->private_segment_size;
2317   packet->group_segment_size = shadow->group_segment_size;
2318   packet->kernel_object = shadow->object;
2319   packet->kernarg_address = shadow->kernarg_address;
2320   hsa_signal_t s;
2321   s.handle = shadow->signal;
2322   packet->completion_signal = s;
2323   hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2324   memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2325
2326   GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2327
2328   uint16_t header;
2329   header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2330   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2331   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2332
2333   GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2334              agent->device_id);
2335
2336   packet_store_release ((uint32_t *) packet, header,
2337                         (uint16_t) kla->ndim
2338                         << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2339
2340   hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2341                                        index);
2342
2343   GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2344
2345   /* Root signal waits with 1ms timeout.  */
2346   while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2347                                              1000 * 1000,
2348                                              HSA_WAIT_STATE_BLOCKED) != 0)
2349     {
2350       console_output (kernel, shadow->kernarg_address, false);
2351     }
2352   console_output (kernel, shadow->kernarg_address, true);
2353
2354   struct kernargs *kernargs = shadow->kernarg_address;
2355   unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2356
2357   release_kernel_dispatch (shadow);
2358
2359   if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2360     GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2361
2362   unsigned int upper = (return_value & ~0xffff) >> 16;
2363   if (upper == 0xcafe)
2364     ; // exit not called, normal termination.
2365   else if (upper == 0xffff)
2366     ; // exit called.
2367   else
2368     {
2369       GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2370                          " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2371                          return_value);
2372       abort ();
2373     }
2374
2375   if (upper == 0xffff)
2376     {
2377       unsigned int signal = (return_value >> 8) & 0xff;
2378
2379       if (signal == SIGABRT)
2380         {
2381           GCN_WARNING ("GCN Kernel aborted\n");
2382           abort ();
2383         }
2384       else if (signal != 0)
2385         {
2386           GCN_WARNING ("GCN Kernel received unknown signal\n");
2387           abort ();
2388         }
2389
2390       GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2391       exit (return_value & 0xff);
2392     }
2393 }
2394
2395 /* }}}  */
2396 /* {{{ Load/Unload  */
2397
2398 /* Initialize KERNEL from D and other parameters.  Return true on success. */
2399
2400 static bool
2401 init_basic_kernel_info (struct kernel_info *kernel,
2402                         struct hsa_kernel_description *d,
2403                         struct agent_info *agent,
2404                         struct module_info *module)
2405 {
2406   kernel->agent = agent;
2407   kernel->module = module;
2408   kernel->name = d->name;
2409   kernel->description = d;
2410   if (pthread_mutex_init (&kernel->init_mutex, NULL))
2411     {
2412       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2413       return false;
2414     }
2415   return true;
2416 }
2417
2418 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2419
2420 static bool
2421 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2422 {
2423   int isa_field = elf_gcn_isa_field (image);
2424   const char* isa_s = isa_hsa_name (isa_field);
2425   if (!isa_s)
2426     {
2427       hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2428       return false;
2429     }
2430
2431   if (isa_field != agent->device_isa)
2432     {
2433       char msg[120];
2434       const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2435       const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2436       assert (agent_isa_s);
2437       assert (agent_isa_gcc_s);
2438
2439       snprintf (msg, sizeof msg,
2440                 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2441                 "Try to recompile with '-foffload-options=-march=%s'.\n",
2442                 isa_s, agent_isa_s, agent_isa_gcc_s);
2443
2444       hsa_error (msg, HSA_STATUS_ERROR);
2445       return false;
2446     }
2447
2448   return true;
2449 }
2450
2451 /* Create and finalize the program consisting of all loaded modules.  */
2452
2453 static bool
2454 create_and_finalize_hsa_program (struct agent_info *agent)
2455 {
2456   hsa_status_t status;
2457   bool res = true;
2458   if (pthread_mutex_lock (&agent->prog_mutex))
2459     {
2460       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2461       return false;
2462     }
2463   if (agent->prog_finalized)
2464     goto final;
2465
2466   status
2467     = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2468                                         HSA_EXECUTABLE_STATE_UNFROZEN,
2469                                         "", &agent->executable);
2470   if (status != HSA_STATUS_SUCCESS)
2471     {
2472       hsa_error ("Could not create GCN executable", status);
2473       goto fail;
2474     }
2475
2476   /* Load any GCN modules.  */
2477   struct module_info *module = agent->module;
2478   if (module)
2479     {
2480       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2481
2482       if (!isa_matches_agent (agent, image))
2483         goto fail;
2484
2485       hsa_code_object_t co = { 0 };
2486       status = hsa_fns.hsa_code_object_deserialize_fn
2487         (module->image_desc->gcn_image->image,
2488          module->image_desc->gcn_image->size,
2489          NULL, &co);
2490       if (status != HSA_STATUS_SUCCESS)
2491         {
2492           hsa_error ("Could not deserialize GCN code object", status);
2493           goto fail;
2494         }
2495
2496       status = hsa_fns.hsa_executable_load_code_object_fn
2497         (agent->executable, agent->id, co, "");
2498       if (status != HSA_STATUS_SUCCESS)
2499         {
2500           hsa_error ("Could not load GCN code object", status);
2501           goto fail;
2502         }
2503
2504       if (!module->heap)
2505         {
2506           status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2507                                                    gcn_kernel_heap_size,
2508                                                    (void**)&module->heap);
2509           if (status != HSA_STATUS_SUCCESS)
2510             {
2511               hsa_error ("Could not allocate memory for GCN heap", status);
2512               goto fail;
2513             }
2514
2515           status = hsa_fns.hsa_memory_assign_agent_fn
2516                         (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2517           if (status != HSA_STATUS_SUCCESS)
2518             {
2519               hsa_error ("Could not assign GCN heap memory to device", status);
2520               goto fail;
2521             }
2522
2523           hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2524                                       &gcn_kernel_heap_size,
2525                                       sizeof (gcn_kernel_heap_size));
2526         }
2527
2528     }
2529
2530   if (debug)
2531     dump_executable_symbols (agent->executable);
2532
2533   status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2534   if (status != HSA_STATUS_SUCCESS)
2535     {
2536       hsa_error ("Could not freeze the GCN executable", status);
2537       goto fail;
2538     }
2539
2540 final:
2541   agent->prog_finalized = true;
2542
2543   if (pthread_mutex_unlock (&agent->prog_mutex))
2544     {
2545       GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2546       res = false;
2547     }
2548
2549   return res;
2550
2551 fail:
2552   res = false;
2553   goto final;
2554 }
2555
2556 /* Free the HSA program in agent and everything associated with it and set
2557    agent->prog_finalized and the initialized flags of all kernels to false.
2558    Return TRUE on success.  */
2559
2560 static bool
2561 destroy_hsa_program (struct agent_info *agent)
2562 {
2563   if (!agent->prog_finalized)
2564     return true;
2565
2566   hsa_status_t status;
2567
2568   GCN_DEBUG ("Destroying the current GCN program.\n");
2569
2570   status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2571   if (status != HSA_STATUS_SUCCESS)
2572     return hsa_error ("Could not destroy GCN executable", status);
2573
2574   if (agent->module)
2575     {
2576       int i;
2577       for (i = 0; i < agent->module->kernel_count; i++)
2578         agent->module->kernels[i].initialized = false;
2579
2580       if (agent->module->heap)
2581         {
2582           hsa_fns.hsa_memory_free_fn (agent->module->heap);
2583           agent->module->heap = NULL;
2584         }
2585     }
2586   agent->prog_finalized = false;
2587   return true;
2588 }
2589
2590 /* Deinitialize all information associated with MODULE and kernels within
2591    it.  Return TRUE on success.  */
2592
2593 static bool
2594 destroy_module (struct module_info *module, bool locked)
2595 {
2596   /* Run destructors before destroying module.  */
2597   struct GOMP_kernel_launch_attributes kla =
2598     { 3,
2599       /* Grid size.  */
2600       { 1, 64, 1 },
2601       /* Work-group size.  */
2602       { 1, 64, 1 }
2603     };
2604
2605   if (module->fini_array_func)
2606     {
2607       init_kernel (module->fini_array_func);
2608       run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2609     }
2610   module->constructors_run_p = false;
2611
2612   int i;
2613   for (i = 0; i < module->kernel_count; i++)
2614     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2615       {
2616         GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2617                            "mutex");
2618         return false;
2619       }
2620
2621   return true;
2622 }
2623
2624 /* }}}  */
2625 /* {{{ Async  */
2626
2627 /* Callback of dispatch queues to report errors.  */
2628
2629 static void
2630 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2631 {
2632   struct queue_entry *entry = &aq->queue[index];
2633
2634   switch (entry->type)
2635     {
2636     case KERNEL_LAUNCH:
2637       if (DEBUG_QUEUES)
2638         GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2639                    aq->agent->device_id, aq->id, index);
2640       run_kernel (entry->u.launch.kernel,
2641                   entry->u.launch.vars,
2642                   &entry->u.launch.kla, aq, false);
2643       if (DEBUG_QUEUES)
2644         GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2645                    aq->agent->device_id, aq->id, index);
2646       break;
2647
2648     case CALLBACK:
2649       if (DEBUG_QUEUES)
2650         GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2651                    aq->agent->device_id, aq->id, index);
2652       entry->u.callback.fn (entry->u.callback.data);
2653       if (DEBUG_QUEUES)
2654         GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2655                    aq->agent->device_id, aq->id, index);
2656       break;
2657
2658     case ASYNC_WAIT:
2659       {
2660         /* FIXME: is it safe to access a placeholder that may already have
2661            been executed?  */
2662         struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2663
2664         if (DEBUG_QUEUES)
2665           GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2666                      aq->agent->device_id, aq->id, index);
2667
2668         pthread_mutex_lock (&placeholderp->mutex);
2669
2670         while (!placeholderp->executed)
2671           pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2672
2673         pthread_mutex_unlock (&placeholderp->mutex);
2674
2675         if (pthread_cond_destroy (&placeholderp->cond))
2676           GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2677
2678         if (pthread_mutex_destroy (&placeholderp->mutex))
2679           GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2680
2681         if (DEBUG_QUEUES)
2682           GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2683                      "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2684       }
2685       break;
2686
2687     case ASYNC_PLACEHOLDER:
2688       pthread_mutex_lock (&entry->u.placeholder.mutex);
2689       entry->u.placeholder.executed = 1;
2690       pthread_cond_signal (&entry->u.placeholder.cond);
2691       pthread_mutex_unlock (&entry->u.placeholder.mutex);
2692       break;
2693
2694     default:
2695       GOMP_PLUGIN_fatal ("Unknown queue element");
2696     }
2697 }
2698
2699 /* This function is run as a thread to service an async queue in the
2700    background.  It runs continuously until the stop flag is set.  */
2701
2702 static void *
2703 drain_queue (void *thread_arg)
2704 {
2705   struct goacc_asyncqueue *aq = thread_arg;
2706
2707   if (DRAIN_QUEUE_SYNCHRONOUS_P)
2708     {
2709       aq->drain_queue_stop = 2;
2710       return NULL;
2711     }
2712
2713   pthread_mutex_lock (&aq->mutex);
2714
2715   while (true)
2716     {
2717       if (aq->drain_queue_stop)
2718         break;
2719
2720       if (aq->queue_n > 0)
2721         {
2722           pthread_mutex_unlock (&aq->mutex);
2723           execute_queue_entry (aq, aq->queue_first);
2724
2725           pthread_mutex_lock (&aq->mutex);
2726           aq->queue_first = ((aq->queue_first + 1)
2727                              % ASYNC_QUEUE_SIZE);
2728           aq->queue_n--;
2729
2730           if (DEBUG_THREAD_SIGNAL)
2731             GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2732                        aq->agent->device_id, aq->id);
2733           pthread_cond_broadcast (&aq->queue_cond_out);
2734           pthread_mutex_unlock (&aq->mutex);
2735
2736           if (DEBUG_QUEUES)
2737             GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2738                        aq->id);
2739           pthread_mutex_lock (&aq->mutex);
2740         }
2741       else
2742         {
2743           if (DEBUG_THREAD_SLEEP)
2744             GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2745                        aq->agent->device_id, aq->id);
2746           pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2747           if (DEBUG_THREAD_SLEEP)
2748             GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2749                        aq->agent->device_id, aq->id);
2750         }
2751     }
2752
2753   aq->drain_queue_stop = 2;
2754   if (DEBUG_THREAD_SIGNAL)
2755     GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2756                aq->agent->device_id, aq->id);
2757   pthread_cond_broadcast (&aq->queue_cond_out);
2758   pthread_mutex_unlock (&aq->mutex);
2759
2760   GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2761   return NULL;
2762 }
2763
2764 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2765    is not usually the case.  This is just a debug tool.  */
2766
2767 static void
2768 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2769 {
2770   pthread_mutex_lock (&aq->mutex);
2771
2772   while (aq->queue_n > 0)
2773     {
2774       execute_queue_entry (aq, aq->queue_first);
2775
2776       aq->queue_first = ((aq->queue_first + 1)
2777                          % ASYNC_QUEUE_SIZE);
2778       aq->queue_n--;
2779     }
2780
2781   pthread_mutex_unlock (&aq->mutex);
2782 }
2783
2784 /* Block the current thread until an async queue is writable.  The aq->mutex
2785    lock should be held on entry, and remains locked on exit.  */
2786
2787 static void
2788 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2789 {
2790   if (aq->queue_n == ASYNC_QUEUE_SIZE)
2791     {
2792       /* Queue is full.  Wait for it to not be full.  */
2793       while (aq->queue_n == ASYNC_QUEUE_SIZE)
2794         pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2795     }
2796 }
2797
2798 /* Request an asynchronous kernel launch on the specified queue.  This
2799    may block if the queue is full, but returns without waiting for the
2800    kernel to run.  */
2801
2802 static void
2803 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2804                    void *vars, struct GOMP_kernel_launch_attributes *kla)
2805 {
2806   assert (aq->agent == kernel->agent);
2807
2808   pthread_mutex_lock (&aq->mutex);
2809
2810   wait_for_queue_nonfull (aq);
2811
2812   int queue_last = ((aq->queue_first + aq->queue_n)
2813                     % ASYNC_QUEUE_SIZE);
2814   if (DEBUG_QUEUES)
2815     GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2816                aq->id, queue_last);
2817
2818   aq->queue[queue_last].type = KERNEL_LAUNCH;
2819   aq->queue[queue_last].u.launch.kernel = kernel;
2820   aq->queue[queue_last].u.launch.vars = vars;
2821   aq->queue[queue_last].u.launch.kla = *kla;
2822
2823   aq->queue_n++;
2824
2825   if (DEBUG_THREAD_SIGNAL)
2826     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2827                aq->agent->device_id, aq->id);
2828   pthread_cond_signal (&aq->queue_cond_in);
2829
2830   pthread_mutex_unlock (&aq->mutex);
2831 }
2832
2833 /* Request an asynchronous callback on the specified queue.  The callback
2834    function will be called, with the given opaque data, from the appropriate
2835    async thread, when all previous items on that queue are complete.  */
2836
2837 static void
2838 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2839                      void *data)
2840 {
2841   pthread_mutex_lock (&aq->mutex);
2842
2843   wait_for_queue_nonfull (aq);
2844
2845   int queue_last = ((aq->queue_first + aq->queue_n)
2846                     % ASYNC_QUEUE_SIZE);
2847   if (DEBUG_QUEUES)
2848     GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2849                aq->id, queue_last);
2850
2851   aq->queue[queue_last].type = CALLBACK;
2852   aq->queue[queue_last].u.callback.fn = fn;
2853   aq->queue[queue_last].u.callback.data = data;
2854
2855   aq->queue_n++;
2856
2857   if (DEBUG_THREAD_SIGNAL)
2858     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2859                aq->agent->device_id, aq->id);
2860   pthread_cond_signal (&aq->queue_cond_in);
2861
2862   pthread_mutex_unlock (&aq->mutex);
2863 }
2864
2865 /* Request that a given async thread wait for another thread (unspecified) to
2866    reach the given placeholder.  The wait will occur when all previous entries
2867    on the queue are complete.  A placeholder is effectively a kind of signal
2868    which simply sets a flag when encountered in a queue.  */
2869
2870 static void
2871 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2872                       struct placeholder *placeholderp)
2873 {
2874   pthread_mutex_lock (&aq->mutex);
2875
2876   wait_for_queue_nonfull (aq);
2877
2878   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2879   if (DEBUG_QUEUES)
2880     GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2881                aq->id, queue_last);
2882
2883   aq->queue[queue_last].type = ASYNC_WAIT;
2884   aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2885
2886   aq->queue_n++;
2887
2888   if (DEBUG_THREAD_SIGNAL)
2889     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2890                aq->agent->device_id, aq->id);
2891   pthread_cond_signal (&aq->queue_cond_in);
2892
2893   pthread_mutex_unlock (&aq->mutex);
2894 }
2895
2896 /* Add a placeholder into an async queue.  When the async thread reaches the
2897    placeholder it will set the "executed" flag to true and continue.
2898    Another thread may be waiting on this thread reaching the placeholder.  */
2899
2900 static struct placeholder *
2901 queue_push_placeholder (struct goacc_asyncqueue *aq)
2902 {
2903   struct placeholder *placeholderp;
2904
2905   pthread_mutex_lock (&aq->mutex);
2906
2907   wait_for_queue_nonfull (aq);
2908
2909   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2910   if (DEBUG_QUEUES)
2911     GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2912                aq->id, queue_last);
2913
2914   aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2915   placeholderp = &aq->queue[queue_last].u.placeholder;
2916
2917   if (pthread_mutex_init (&placeholderp->mutex, NULL))
2918     {
2919       pthread_mutex_unlock (&aq->mutex);
2920       GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2921     }
2922
2923   if (pthread_cond_init (&placeholderp->cond, NULL))
2924     {
2925       pthread_mutex_unlock (&aq->mutex);
2926       GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2927     }
2928
2929   placeholderp->executed = 0;
2930
2931   aq->queue_n++;
2932
2933   if (DEBUG_THREAD_SIGNAL)
2934     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2935                aq->agent->device_id, aq->id);
2936   pthread_cond_signal (&aq->queue_cond_in);
2937
2938   pthread_mutex_unlock (&aq->mutex);
2939
2940   return placeholderp;
2941 }
2942
2943 /* Signal an asynchronous thread to terminate, and wait for it to do so.  */
2944
2945 static void
2946 finalize_async_thread (struct goacc_asyncqueue *aq)
2947 {
2948   pthread_mutex_lock (&aq->mutex);
2949   if (aq->drain_queue_stop == 2)
2950     {
2951       pthread_mutex_unlock (&aq->mutex);
2952       return;
2953     }
2954
2955   aq->drain_queue_stop = 1;
2956
2957   if (DEBUG_THREAD_SIGNAL)
2958     GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2959                aq->agent->device_id, aq->id);
2960   pthread_cond_signal (&aq->queue_cond_in);
2961
2962   while (aq->drain_queue_stop != 2)
2963     {
2964       if (DEBUG_THREAD_SLEEP)
2965         GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2966                    " to sleep\n", aq->agent->device_id, aq->id);
2967       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2968       if (DEBUG_THREAD_SLEEP)
2969         GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
2970                    aq->agent->device_id, aq->id);
2971     }
2972
2973   GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2974              aq->id);
2975   pthread_mutex_unlock (&aq->mutex);
2976
2977   int err = pthread_join (aq->thread_drain_queue, NULL);
2978   if (err != 0)
2979     GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2980                        aq->agent->device_id, aq->id, strerror (err));
2981   GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2982 }
2983
2984 /* Set up an async queue for OpenMP.  There will be only one.  The
2985    implementation simply uses an OpenACC async queue.
2986    FIXME: is this thread-safe if two threads call this function?  */
2987
2988 static void
2989 maybe_init_omp_async (struct agent_info *agent)
2990 {
2991   if (!agent->omp_async_queue)
2992     agent->omp_async_queue
2993       = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2994 }
2995
2996 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2997    copies from read-only pages.  */
2998
2999 static void
3000 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3001 {
3002   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3003
3004   if (status == HSA_STATUS_SUCCESS)
3005     return;
3006
3007   /* It appears that the copy fails if the source data is in a read-only page.
3008      We can't detect that easily, so try copying the data to a temporary buffer
3009      and doing the copy again if we got an error above.  */
3010
3011   GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3012                "[%p:+%d]\n", (void *) src, (int) len);
3013
3014   void *src_copy = malloc (len);
3015   memcpy (src_copy, src, len);
3016   status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3017   free (src_copy);
3018   if (status != HSA_STATUS_SUCCESS)
3019     GOMP_PLUGIN_error ("memory copy failed");
3020 }
3021
3022 /* Copy data to or from a device.  This is intended for use as an async
3023    callback event.  */
3024
3025 static void
3026 copy_data (void *data_)
3027 {
3028   struct copy_data *data = (struct copy_data *)data_;
3029   GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3030              data->aq->agent->device_id, data->aq->id, data->len, data->src,
3031              data->dst);
3032   hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3033   free (data);
3034 }
3035
3036 /* Request an asynchronous data copy, to or from a device, on a given queue.
3037    The event will be registered as a callback.  */
3038
3039 static void
3040 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3041                  size_t len)
3042 {
3043   if (DEBUG_QUEUES)
3044     GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3045                aq->agent->device_id, aq->id, len, src, dst);
3046   struct copy_data *data
3047     = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3048   data->dst = dst;
3049   data->src = src;
3050   data->len = len;
3051   data->aq = aq;
3052   queue_push_callback (aq, copy_data, data);
3053 }
3054
3055 /* Return true if the given queue is currently empty.  */
3056
3057 static int
3058 queue_empty (struct goacc_asyncqueue *aq)
3059 {
3060   pthread_mutex_lock (&aq->mutex);
3061   int res = aq->queue_n == 0 ? 1 : 0;
3062   pthread_mutex_unlock (&aq->mutex);
3063
3064   return res;
3065 }
3066
3067 /* Wait for a given queue to become empty.  This implements an OpenACC wait
3068    directive.  */
3069
3070 static void
3071 wait_queue (struct goacc_asyncqueue *aq)
3072 {
3073   if (DRAIN_QUEUE_SYNCHRONOUS_P)
3074     {
3075       drain_queue_synchronous (aq);
3076       return;
3077     }
3078
3079   pthread_mutex_lock (&aq->mutex);
3080
3081   while (aq->queue_n > 0)
3082     {
3083       if (DEBUG_THREAD_SLEEP)
3084         GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3085                    aq->agent->device_id, aq->id);
3086       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3087       if (DEBUG_THREAD_SLEEP)
3088         GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
3089                    aq->id);
3090     }
3091
3092   pthread_mutex_unlock (&aq->mutex);
3093   GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3094 }
3095
3096 /* }}}  */
3097 /* {{{ OpenACC support  */
3098
3099 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
3100
3101 static void
3102 gcn_exec (struct kernel_info *kernel,
3103           void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3104           struct goacc_asyncqueue *aq)
3105 {
3106   if (!GOMP_OFFLOAD_can_run (kernel))
3107     GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3108
3109   /* If we get here then this must be an OpenACC kernel.  */
3110   kernel->kind = KIND_OPENACC;
3111
3112   struct hsa_kernel_description *hsa_kernel_desc = NULL;
3113   for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3114     {
3115       struct hsa_kernel_description *d
3116         = &kernel->module->image_desc->kernel_infos[i];
3117       if (d->name == kernel->name)
3118         {
3119           hsa_kernel_desc = d;
3120           break;
3121         }
3122     }
3123
3124   /* We may have statically-determined dimensions in
3125      hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3126      invocation at runtime in dims[].  We allow static dimensions to take
3127      priority over dynamic dimensions when present (non-zero).  */
3128   if (hsa_kernel_desc->oacc_dims[0] > 0)
3129     dims[0] = hsa_kernel_desc->oacc_dims[0];
3130   if (hsa_kernel_desc->oacc_dims[1] > 0)
3131     dims[1] = hsa_kernel_desc->oacc_dims[1];
3132   if (hsa_kernel_desc->oacc_dims[2] > 0)
3133     dims[2] = hsa_kernel_desc->oacc_dims[2];
3134
3135   /* Ideally, when a dimension isn't explicitly specified, we should
3136      tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3137      In practice, we tune for peak performance on BabelStream, which
3138      for OpenACC is currently 32 threads per CU.  */
3139   if (dims[0] == 0 && dims[1] == 0)
3140     {
3141       /* If any of the OpenACC dimensions remain 0 then we get to pick a
3142          number.  There isn't really a correct answer for this without a clue
3143          about the problem size, so let's do a reasonable number of workers
3144          and gangs.  */
3145
3146       dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
3147       dims[1] = 8; /* Workers.  */
3148     }
3149   else if (dims[0] == 0 && dims[1] > 0)
3150     {
3151       /* Auto-scale the number of gangs with the requested number of workers.  */
3152       dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3153     }
3154   else if (dims[0] > 0 && dims[1] == 0)
3155     {
3156       /* Auto-scale the number of workers with the requested number of gangs.  */
3157       dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3158       if (dims[1] == 0)
3159         dims[1] = 1;
3160       if (dims[1] > 16)
3161         dims[1] = 16;
3162     }
3163
3164   /* The incoming dimensions are expressed in terms of gangs, workers, and
3165      vectors.  The HSA dimensions are expressed in terms of "work-items",
3166      which means multiples of vector lanes.
3167
3168      The "grid size" specifies the size of the problem space, and the
3169      "work-group size" specifies how much of that we want a single compute
3170      unit to chew on at once.
3171
3172      The three dimensions do not really correspond to hardware, but the
3173      important thing is that the HSA runtime will launch as many
3174      work-groups as it takes to process the entire grid, and each
3175      work-group will contain as many wave-fronts as it takes to process
3176      the work-items in that group.
3177
3178      Essentially, as long as we set the Y dimension to 64 (the number of
3179      vector lanes in hardware), and the Z group size to the maximum (16),
3180      then we will get the gangs (X) and workers (Z) launched as we expect.
3181
3182      The reason for the apparent reversal of vector and worker dimension
3183      order is to do with the way the run-time distributes work-items across
3184      v1 and v2.  */
3185   struct GOMP_kernel_launch_attributes kla =
3186     {3,
3187      /* Grid size.  */
3188      {dims[0], 64, dims[1]},
3189      /* Work-group size.  */
3190      {1,       64, 16}
3191     };
3192
3193   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3194   acc_prof_info *prof_info = thr->prof_info;
3195   acc_event_info enqueue_launch_event_info;
3196   acc_api_info *api_info = thr->api_info;
3197   bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3198   if (profiling_dispatch_p)
3199     {
3200       prof_info->event_type = acc_ev_enqueue_launch_start;
3201
3202       enqueue_launch_event_info.launch_event.event_type
3203         = prof_info->event_type;
3204       enqueue_launch_event_info.launch_event.valid_bytes
3205         = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3206       enqueue_launch_event_info.launch_event.parent_construct
3207         = acc_construct_parallel;
3208       enqueue_launch_event_info.launch_event.implicit = 1;
3209       enqueue_launch_event_info.launch_event.tool_info = NULL;
3210       enqueue_launch_event_info.launch_event.kernel_name
3211         = (char *) kernel->name;
3212       enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3213       enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3214       enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3215
3216       api_info->device_api = acc_device_api_other;
3217
3218       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3219         &enqueue_launch_event_info, api_info);
3220     }
3221
3222   if (!async)
3223     run_kernel (kernel, devaddrs, &kla, NULL, false);
3224   else
3225     queue_push_launch (aq, kernel, devaddrs, &kla);
3226
3227   if (profiling_dispatch_p)
3228     {
3229       prof_info->event_type = acc_ev_enqueue_launch_end;
3230       enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3231       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3232                                             &enqueue_launch_event_info,
3233                                             api_info);
3234     }
3235 }
3236
3237 /* }}}  */
3238 /* {{{ Generic Plugin API  */
3239
3240 /* Return the name of the accelerator, which is "gcn".  */
3241
3242 const char *
3243 GOMP_OFFLOAD_get_name (void)
3244 {
3245   return "gcn";
3246 }
3247
3248 /* Return the specific capabilities the HSA accelerator have.  */
3249
3250 unsigned int
3251 GOMP_OFFLOAD_get_caps (void)
3252 {
3253   /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
3254   return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3255             | GOMP_OFFLOAD_CAP_OPENACC_200;
3256 }
3257
3258 /* Identify as GCN accelerator.  */
3259
3260 int
3261 GOMP_OFFLOAD_get_type (void)
3262 {
3263   return OFFLOAD_TARGET_TYPE_GCN;
3264 }
3265
3266 /* Return the libgomp version number we're compatible with.  There is
3267    no requirement for cross-version compatibility.  */
3268
3269 unsigned
3270 GOMP_OFFLOAD_version (void)
3271 {
3272   return GOMP_VERSION;
3273 }
3274
3275 /* Return the number of GCN devices on the system.  */
3276
3277 int
3278 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3279 {
3280   if (!init_hsa_context ())
3281     return 0;
3282   /* Return -1 if no omp_requires_mask cannot be fulfilled but
3283      devices were present.  */
3284   if (hsa_context.agent_count > 0
3285       && ((omp_requires_mask
3286            & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3287                | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3288     return -1;
3289   return hsa_context.agent_count;
3290 }
3291
3292 /* Initialize device (agent) number N so that it can be used for computation.
3293    Return TRUE on success.  */
3294
3295 bool
3296 GOMP_OFFLOAD_init_device (int n)
3297 {
3298   if (!init_hsa_context ())
3299     return false;
3300   if (n >= hsa_context.agent_count)
3301     {
3302       GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3303       return false;
3304     }
3305   struct agent_info *agent = &hsa_context.agents[n];
3306
3307   if (agent->initialized)
3308     return true;
3309
3310   agent->device_id = n;
3311
3312   if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3313     {
3314       GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3315       return false;
3316     }
3317   if (pthread_mutex_init (&agent->prog_mutex, NULL))
3318     {
3319       GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3320       return false;
3321     }
3322   if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3323     {
3324       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3325       return false;
3326     }
3327   if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
3328     {
3329       GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3330       return false;
3331     }
3332   agent->async_queues = NULL;
3333   agent->omp_async_queue = NULL;
3334   agent->ephemeral_memories_list = NULL;
3335
3336   uint32_t queue_size;
3337   hsa_status_t status;
3338   status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3339                                           HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3340                                           &queue_size);
3341   if (status != HSA_STATUS_SUCCESS)
3342     return hsa_error ("Error requesting maximum queue size of the GCN agent",
3343                       status);
3344
3345   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3346                                           &agent->name);
3347   if (status != HSA_STATUS_SUCCESS)
3348     return hsa_error ("Error querying the name of the agent", status);
3349
3350   agent->device_isa = isa_code (agent->name);
3351   if (agent->device_isa < 0)
3352     return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3353
3354   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3355                                           &agent->vendor_name);
3356   if (status != HSA_STATUS_SUCCESS)
3357     return hsa_error ("Error querying the vendor name of the agent", status);
3358
3359   status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3360                                         HSA_QUEUE_TYPE_MULTI,
3361                                         hsa_queue_callback, NULL, UINT32_MAX,
3362                                         UINT32_MAX, &agent->sync_queue);
3363   if (status != HSA_STATUS_SUCCESS)
3364     return hsa_error ("Error creating command queue", status);
3365
3366   agent->kernarg_region.handle = (uint64_t) -1;
3367   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3368                                                  get_kernarg_memory_region,
3369                                                  &agent->kernarg_region);
3370   if (status != HSA_STATUS_SUCCESS
3371       && status != HSA_STATUS_INFO_BREAK)
3372     hsa_error ("Scanning memory regions failed", status);
3373   if (agent->kernarg_region.handle == (uint64_t) -1)
3374     {
3375       GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3376                          "arguments");
3377       return false;
3378     }
3379   GCN_DEBUG ("Selected kernel arguments memory region:\n");
3380   dump_hsa_region (agent->kernarg_region, NULL);
3381
3382   agent->data_region.handle = (uint64_t) -1;
3383   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3384                                                  get_data_memory_region,
3385                                                  &agent->data_region);
3386   if (status != HSA_STATUS_SUCCESS
3387       && status != HSA_STATUS_INFO_BREAK)
3388     hsa_error ("Scanning memory regions failed", status);
3389   if (agent->data_region.handle == (uint64_t) -1)
3390     {
3391       GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3392                          "data");
3393       return false;
3394     }
3395   GCN_DEBUG ("Selected device data memory region:\n");
3396   dump_hsa_region (agent->data_region, NULL);
3397
3398   GCN_DEBUG ("GCN agent %d initialized\n", n);
3399
3400   agent->initialized = true;
3401   return true;
3402 }
3403
3404 /* Load GCN object-code module described by struct gcn_image_desc in
3405    TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3406    If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
3407    contain the on-device addresses of the functions for reverse offload.  To be
3408    freed by the caller.  */
3409
3410 int
3411 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3412                          struct addr_pair **target_table,
3413                          uint64_t **rev_fn_table,
3414                          uint64_t *host_ind_fn_table)
3415 {
3416   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3417     {
3418       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3419                          " (expected %u, received %u)",
3420                          GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3421       return -1;
3422     }
3423
3424   struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3425   struct agent_info *agent;
3426   struct addr_pair *pair;
3427   struct module_info *module;
3428   struct kernel_info *kernel;
3429   int kernel_count = image_desc->kernel_count;
3430   unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3431                               ? image_desc->ind_func_count : 0;
3432   unsigned var_count = image_desc->global_variable_count;
3433   /* Currently, "others" is a struct of ICVS.  */
3434   int other_count = 1;
3435
3436   agent = get_agent_info (ord);
3437   if (!agent)
3438     return -1;
3439
3440   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3441     {
3442       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3443       return -1;
3444     }
3445   if (agent->prog_finalized
3446       && !destroy_hsa_program (agent))
3447     return -1;
3448
3449   GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3450   GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
3451   GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3452   GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3453   pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3454                              * sizeof (struct addr_pair));
3455   *target_table = pair;
3456   module = (struct module_info *)
3457     GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3458                                 + kernel_count * sizeof (struct kernel_info));
3459   module->image_desc = image_desc;
3460   module->kernel_count = kernel_count;
3461   module->heap = NULL;
3462   module->constructors_run_p = false;
3463
3464   kernel = &module->kernels[0];
3465
3466   /* Allocate memory for kernel dependencies.  */
3467   for (unsigned i = 0; i < kernel_count; i++)
3468     {
3469       struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3470       if (!init_basic_kernel_info (kernel, d, agent, module))
3471         return -1;
3472       if (strcmp (d->name, "_init_array") == 0)
3473         module->init_array_func = kernel;
3474       else if (strcmp (d->name, "_fini_array") == 0)
3475         module->fini_array_func = kernel;
3476       else
3477         {
3478           pair->start = (uintptr_t) kernel;
3479           pair->end = (uintptr_t) (kernel + 1);
3480           pair++;
3481         }
3482       kernel++;
3483     }
3484
3485   agent->module = module;
3486   if (pthread_rwlock_unlock (&agent->module_rwlock))
3487     {
3488       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3489       return -1;
3490     }
3491
3492   if (!create_and_finalize_hsa_program (agent))
3493     return -1;
3494
3495   if (var_count > 0)
3496     {
3497       hsa_status_t status;
3498       hsa_executable_symbol_t var_symbol;
3499       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3500                                                      ".offload_var_table",
3501                                                      agent->id,
3502                                                      0, &var_symbol);
3503
3504       if (status != HSA_STATUS_SUCCESS)
3505         hsa_fatal ("Could not find symbol for variable in the code object",
3506                    status);
3507
3508       uint64_t var_table_addr;
3509       status = hsa_fns.hsa_executable_symbol_get_info_fn
3510         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3511          &var_table_addr);
3512       if (status != HSA_STATUS_SUCCESS)
3513         hsa_fatal ("Could not extract a variable from its symbol", status);
3514
3515       struct {
3516         uint64_t addr;
3517         uint64_t size;
3518       } var_table[var_count];
3519       GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3520                              (void*)var_table_addr, sizeof (var_table));
3521
3522       for (unsigned i = 0; i < var_count; i++)
3523         {
3524           pair->start = var_table[i].addr;
3525           pair->end = var_table[i].addr + var_table[i].size;
3526           GCN_DEBUG ("Found variable at %p with size %lu\n",
3527                      (void *)var_table[i].addr, var_table[i].size);
3528           pair++;
3529         }
3530     }
3531
3532   if (ind_func_count > 0)
3533     {
3534       hsa_status_t status;
3535
3536       /* Read indirect function table from image.  */
3537       hsa_executable_symbol_t ind_funcs_symbol;
3538       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3539                                                      ".offload_ind_func_table",
3540                                                      agent->id,
3541                                                      0, &ind_funcs_symbol);
3542
3543       if (status != HSA_STATUS_SUCCESS)
3544         hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3545                    "code object", status);
3546
3547       uint64_t ind_funcs_table_addr;
3548       status = hsa_fns.hsa_executable_symbol_get_info_fn
3549         (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3550          &ind_funcs_table_addr);
3551       if (status != HSA_STATUS_SUCCESS)
3552         hsa_fatal ("Could not extract a variable from its symbol", status);
3553
3554       uint64_t ind_funcs_table[ind_func_count];
3555       GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
3556                              (void*) ind_funcs_table_addr,
3557                              sizeof (ind_funcs_table));
3558
3559       /* Build host->target address map for indirect functions.  */
3560       uint64_t ind_fn_map[ind_func_count * 2 + 1];
3561       for (unsigned i = 0; i < ind_func_count; i++)
3562         {
3563           ind_fn_map[i * 2] = host_ind_fn_table[i];
3564           ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
3565           GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3566                      i, host_ind_fn_table[i], ind_funcs_table[i]);
3567         }
3568       ind_fn_map[ind_func_count * 2] = 0;
3569
3570       /* Write the map onto the target.  */
3571       void *map_target_addr
3572         = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
3573       GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
3574
3575       GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
3576                              (void*) ind_fn_map,
3577                              sizeof (ind_fn_map));
3578
3579       /* Write address of the map onto the target.  */
3580       hsa_executable_symbol_t symbol;
3581
3582       status
3583         = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3584                                                 XSTRING (GOMP_INDIRECT_ADDR_MAP),
3585                                                 agent->id, 0, &symbol);
3586       if (status != HSA_STATUS_SUCCESS)
3587         hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3588                    status);
3589
3590       uint64_t varptr;
3591       uint32_t varsize;
3592
3593       status = hsa_fns.hsa_executable_symbol_get_info_fn
3594         (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3595          &varptr);
3596       if (status != HSA_STATUS_SUCCESS)
3597         hsa_fatal ("Could not extract a variable from its symbol", status);
3598       status = hsa_fns.hsa_executable_symbol_get_info_fn
3599         (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3600         &varsize);
3601       if (status != HSA_STATUS_SUCCESS)
3602         hsa_fatal ("Could not extract a variable size from its symbol",
3603                    status);
3604
3605       GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3606                  varptr, varsize);
3607
3608       GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
3609                              &map_target_addr,
3610                              sizeof (map_target_addr));
3611     }
3612
3613   GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3614
3615   hsa_status_t status;
3616   hsa_executable_symbol_t var_symbol;
3617   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3618                                                  XSTRING (GOMP_ADDITIONAL_ICVS),
3619                                                  agent->id, 0, &var_symbol);
3620   if (status == HSA_STATUS_SUCCESS)
3621     {
3622       uint64_t varptr;
3623       uint32_t varsize;
3624
3625       status = hsa_fns.hsa_executable_symbol_get_info_fn
3626         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3627          &varptr);
3628       if (status != HSA_STATUS_SUCCESS)
3629         hsa_fatal ("Could not extract a variable from its symbol", status);
3630       status = hsa_fns.hsa_executable_symbol_get_info_fn
3631         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3632          &varsize);
3633       if (status != HSA_STATUS_SUCCESS)
3634         hsa_fatal ("Could not extract a variable size from its symbol",
3635                    status);
3636
3637       pair->start = varptr;
3638       pair->end = varptr + varsize;
3639     }
3640   else
3641     {
3642       /* The variable was not in this image.  */
3643       GCN_DEBUG ("Variable not found in image: %s\n",
3644                  XSTRING (GOMP_ADDITIONAL_ICVS));
3645       pair->start = pair->end = 0;
3646     }
3647
3648   /* Ensure that constructors are run first.  */
3649   struct GOMP_kernel_launch_attributes kla =
3650     { 3,
3651       /* Grid size.  */
3652       { 1, 64, 1 },
3653       /* Work-group size.  */
3654       { 1, 64, 1 }
3655     };
3656
3657   if (module->init_array_func)
3658     {
3659       init_kernel (module->init_array_func);
3660       run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3661     }
3662   module->constructors_run_p = true;
3663
3664   /* Don't report kernels that libgomp need not know about.  */
3665   if (module->init_array_func)
3666     kernel_count--;
3667   if (module->fini_array_func)
3668     kernel_count--;
3669
3670   if (rev_fn_table != NULL && kernel_count == 0)
3671     *rev_fn_table = NULL;
3672   else if (rev_fn_table != NULL)
3673     {
3674       hsa_status_t status;
3675       hsa_executable_symbol_t var_symbol;
3676       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3677                                                      ".offload_func_table",
3678                                                      agent->id, 0, &var_symbol);
3679       if (status != HSA_STATUS_SUCCESS)
3680         hsa_fatal ("Could not find symbol for variable in the code object",
3681                    status);
3682       uint64_t fn_table_addr;
3683       status = hsa_fns.hsa_executable_symbol_get_info_fn
3684         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3685          &fn_table_addr);
3686       if (status != HSA_STATUS_SUCCESS)
3687         hsa_fatal ("Could not extract a variable from its symbol", status);
3688       *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3689       GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3690                              (void*) fn_table_addr,
3691                              kernel_count * sizeof (uint64_t));
3692     }
3693
3694   return kernel_count + var_count + other_count;
3695 }
3696
3697 /* Unload GCN object-code module described by struct gcn_image_desc in
3698    TARGET_DATA from agent number N.  Return TRUE on success.  */
3699
3700 bool
3701 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3702 {
3703   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3704     {
3705       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3706                          " (expected %u, received %u)",
3707                          GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3708       return false;
3709     }
3710
3711   struct agent_info *agent;
3712   agent = get_agent_info (n);
3713   if (!agent)
3714     return false;
3715
3716   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3717     {
3718       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3719       return false;
3720     }
3721
3722   if (!agent->module || agent->module->image_desc != target_data)
3723     {
3724       GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3725                          "loaded before");
3726       return false;
3727     }
3728
3729   if (!destroy_module (agent->module, true))
3730     return false;
3731   free (agent->module);
3732   agent->module = NULL;
3733   if (!destroy_hsa_program (agent))
3734     return false;
3735   if (pthread_rwlock_unlock (&agent->module_rwlock))
3736     {
3737       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3738       return false;
3739     }
3740   return true;
3741 }
3742
3743 /* Deinitialize all information and status associated with agent number N.  We
3744    do not attempt any synchronization, assuming the user and libgomp will not
3745    attempt deinitialization of a device that is in any way being used at the
3746    same time.  Return TRUE on success.  */
3747
3748 bool
3749 GOMP_OFFLOAD_fini_device (int n)
3750 {
3751   struct agent_info *agent = get_agent_info (n);
3752   if (!agent)
3753     return false;
3754
3755   if (!agent->initialized)
3756     return true;
3757
3758   if (agent->omp_async_queue)
3759     {
3760       GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3761       agent->omp_async_queue = NULL;
3762     }
3763
3764   if (agent->module)
3765     {
3766       if (!destroy_module (agent->module, false))
3767         return false;
3768       free (agent->module);
3769       agent->module = NULL;
3770     }
3771
3772   if (!destroy_ephemeral_memories (agent))
3773     return false;
3774
3775   if (!destroy_hsa_program (agent))
3776     return false;
3777
3778   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3779   if (status != HSA_STATUS_SUCCESS)
3780     return hsa_error ("Error destroying command queue", status);
3781
3782   if (pthread_mutex_destroy (&agent->prog_mutex))
3783     {
3784       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3785       return false;
3786     }
3787   if (pthread_rwlock_destroy (&agent->module_rwlock))
3788     {
3789       GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3790       return false;
3791     }
3792
3793   if (pthread_mutex_destroy (&agent->async_queues_mutex))
3794     {
3795       GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3796       return false;
3797     }
3798   if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
3799     {
3800       GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3801       return false;
3802     }
3803   agent->initialized = false;
3804   return true;
3805 }
3806
3807 /* Return true if the HSA runtime can run function FN_PTR.  */
3808
3809 bool
3810 GOMP_OFFLOAD_can_run (void *fn_ptr)
3811 {
3812   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3813
3814   init_kernel (kernel);
3815   if (kernel->initialization_failed)
3816     goto failure;
3817
3818   return true;
3819
3820 failure:
3821   if (suppress_host_fallback)
3822     GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3823   GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3824   return false;
3825 }
3826
3827 /* Allocate memory on device N.  */
3828
3829 void *
3830 GOMP_OFFLOAD_alloc (int n, size_t size)
3831 {
3832   struct agent_info *agent = get_agent_info (n);
3833   return alloc_by_agent (agent, size);
3834 }
3835
3836 /* Free memory from device N.  */
3837
3838 bool
3839 GOMP_OFFLOAD_free (int device, void *ptr)
3840 {
3841   GCN_DEBUG ("Freeing memory on device %d\n", device);
3842
3843   hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3844   if (status != HSA_STATUS_SUCCESS)
3845     {
3846       hsa_error ("Could not free device memory", status);
3847       return false;
3848     }
3849
3850   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3851   bool profiling_dispatch_p
3852     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3853   if (profiling_dispatch_p)
3854     {
3855       acc_prof_info *prof_info = thr->prof_info;
3856       acc_event_info data_event_info;
3857       acc_api_info *api_info = thr->api_info;
3858
3859       prof_info->event_type = acc_ev_free;
3860
3861       data_event_info.data_event.event_type = prof_info->event_type;
3862       data_event_info.data_event.valid_bytes
3863         = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3864       data_event_info.data_event.parent_construct
3865         = acc_construct_parallel;
3866       data_event_info.data_event.implicit = 1;
3867       data_event_info.data_event.tool_info = NULL;
3868       data_event_info.data_event.var_name = NULL;
3869       data_event_info.data_event.bytes = 0;
3870       data_event_info.data_event.host_ptr = NULL;
3871       data_event_info.data_event.device_ptr = (void *) ptr;
3872
3873       api_info->device_api = acc_device_api_other;
3874
3875       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3876                                             api_info);
3877     }
3878
3879   return true;
3880 }
3881
3882 /* Copy data from DEVICE to host.  */
3883
3884 bool
3885 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3886 {
3887   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3888              src, dst);
3889   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3890   if (status != HSA_STATUS_SUCCESS)
3891     GOMP_PLUGIN_error ("memory copy failed");
3892   return true;
3893 }
3894
3895 /* Copy data from host to DEVICE.  */
3896
3897 bool
3898 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3899 {
3900   GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3901              device, dst);
3902   hsa_memory_copy_wrapper (dst, src, n);
3903   return true;
3904 }
3905
3906 /* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
3907
3908 bool
3909 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3910 {
3911   struct gcn_thread *thread_data = gcn_thread ();
3912
3913   if (thread_data && !async_synchronous_p (thread_data->async))
3914     {
3915       struct agent_info *agent = get_agent_info (device);
3916       maybe_init_omp_async (agent);
3917       queue_push_copy (agent->omp_async_queue, dst, src, n);
3918       return true;
3919     }
3920
3921   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3922              device, src, device, dst);
3923   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3924   if (status != HSA_STATUS_SUCCESS)
3925     GOMP_PLUGIN_error ("memory copy failed");
3926   return true;
3927 }
3928
3929 /* }}}  */
3930 /* {{{ OpenMP Plugin API  */
3931
3932 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3933    in VARS as a parameter.  The kernel is identified by FN_PTR which must point
3934    to a kernel_info structure, and must have previously been loaded to the
3935    specified device.  */
3936
3937 void
3938 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3939 {
3940   struct agent_info *agent = get_agent_info (device);
3941   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3942   struct GOMP_kernel_launch_attributes def;
3943   struct GOMP_kernel_launch_attributes *kla;
3944   assert (agent == kernel->agent);
3945
3946   /* If we get here then the kernel must be OpenMP.  */
3947   kernel->kind = KIND_OPENMP;
3948
3949   if (!parse_target_attributes (args, &def, &kla, agent))
3950     {
3951       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3952       return;
3953     }
3954   run_kernel (kernel, vars, kla, NULL, false);
3955 }
3956
3957 /* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
3958    GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3959    GOMP_PLUGIN_target_task_completion when it has finished.  */
3960
3961 void
3962 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3963                         void **args, void *async_data)
3964 {
3965   GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3966   struct agent_info *agent = get_agent_info (device);
3967   struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3968   struct GOMP_kernel_launch_attributes def;
3969   struct GOMP_kernel_launch_attributes *kla;
3970   assert (agent == kernel->agent);
3971
3972   /* If we get here then the kernel must be OpenMP.  */
3973   kernel->kind = KIND_OPENMP;
3974
3975   if (!parse_target_attributes (args, &def, &kla, agent))
3976     {
3977       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3978       return;
3979     }
3980
3981   maybe_init_omp_async (agent);
3982   queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3983   queue_push_callback (agent->omp_async_queue,
3984                        GOMP_PLUGIN_target_task_completion, async_data);
3985 }
3986
3987 /* }}} */
3988 /* {{{ OpenACC Plugin API  */
3989
3990 /* Run a synchronous OpenACC kernel.  The device number is inferred from the
3991    already-loaded KERNEL.  */
3992
3993 void
3994 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
3995                            size_t mapnum __attribute__((unused)),
3996                            void **hostaddrs __attribute__((unused)),
3997                            void **devaddrs, unsigned *dims,
3998                            void *targ_mem_desc)
3999 {
4000   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4001
4002   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
4003 }
4004
4005 /* Run an asynchronous OpenACC kernel on the specified queue.  */
4006
4007 void
4008 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
4009                                  size_t mapnum __attribute__((unused)),
4010                                  void **hostaddrs __attribute__((unused)),
4011                                  void **devaddrs,
4012                                  unsigned *dims, void *targ_mem_desc,
4013                                  struct goacc_asyncqueue *aq)
4014 {
4015   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4016
4017   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
4018 }
4019
4020 /* Create a new asynchronous thread and queue for running future kernels.  */
4021
4022 struct goacc_asyncqueue *
4023 GOMP_OFFLOAD_openacc_async_construct (int device)
4024 {
4025   struct agent_info *agent = get_agent_info (device);
4026
4027   pthread_mutex_lock (&agent->async_queues_mutex);
4028
4029   struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
4030   aq->agent = get_agent_info (device);
4031   aq->prev = NULL;
4032   aq->next = agent->async_queues;
4033   if (aq->next)
4034     {
4035       aq->next->prev = aq;
4036       aq->id = aq->next->id + 1;
4037     }
4038   else
4039     aq->id = 1;
4040   agent->async_queues = aq;
4041
4042   aq->queue_first = 0;
4043   aq->queue_n = 0;
4044   aq->drain_queue_stop = 0;
4045
4046   if (pthread_mutex_init (&aq->mutex, NULL))
4047     {
4048       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
4049       return false;
4050     }
4051   if (pthread_cond_init (&aq->queue_cond_in, NULL))
4052     {
4053       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4054       return false;
4055     }
4056   if (pthread_cond_init (&aq->queue_cond_out, NULL))
4057     {
4058       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4059       return false;
4060     }
4061
4062   hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4063                                                      ASYNC_QUEUE_SIZE,
4064                                                      HSA_QUEUE_TYPE_MULTI,
4065                                                      hsa_queue_callback, NULL,
4066                                                      UINT32_MAX, UINT32_MAX,
4067                                                      &aq->hsa_queue);
4068   if (status != HSA_STATUS_SUCCESS)
4069     hsa_fatal ("Error creating command queue", status);
4070
4071   int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4072   if (err != 0)
4073     GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4074                        strerror (err));
4075   GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4076              aq->id);
4077
4078   pthread_mutex_unlock (&agent->async_queues_mutex);
4079
4080   return aq;
4081 }
4082
4083 /* Destroy an existing asynchronous thread and queue.  Waits for any
4084    currently-running task to complete, but cancels any queued tasks.  */
4085
4086 bool
4087 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4088 {
4089   struct agent_info *agent = aq->agent;
4090
4091   finalize_async_thread (aq);
4092
4093   pthread_mutex_lock (&agent->async_queues_mutex);
4094
4095   int err;
4096   if ((err = pthread_mutex_destroy (&aq->mutex)))
4097     {
4098       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4099       goto fail;
4100     }
4101   if (pthread_cond_destroy (&aq->queue_cond_in))
4102     {
4103       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4104       goto fail;
4105     }
4106   if (pthread_cond_destroy (&aq->queue_cond_out))
4107     {
4108       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4109       goto fail;
4110     }
4111   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4112   if (status != HSA_STATUS_SUCCESS)
4113     {
4114       hsa_error ("Error destroying command queue", status);
4115       goto fail;
4116     }
4117
4118   if (aq->prev)
4119     aq->prev->next = aq->next;
4120   if (aq->next)
4121     aq->next->prev = aq->prev;
4122   if (agent->async_queues == aq)
4123     agent->async_queues = aq->next;
4124
4125   GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4126
4127   free (aq);
4128   pthread_mutex_unlock (&agent->async_queues_mutex);
4129   return true;
4130
4131 fail:
4132   pthread_mutex_unlock (&agent->async_queues_mutex);
4133   return false;
4134 }
4135
4136 /* Return true if the specified async queue is currently empty.  */
4137
4138 int
4139 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4140 {
4141   return queue_empty (aq);
4142 }
4143
4144 /* Block until the specified queue has executed all its tasks and the
4145    queue is empty.  */
4146
4147 bool
4148 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4149 {
4150   wait_queue (aq);
4151   return true;
4152 }
4153
4154 /* Add a serialization point across two async queues. Any new tasks added to
4155    AQ2, after this call, will not run until all tasks on AQ1, at the time
4156    of this call, have completed.  */
4157
4158 bool
4159 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4160                                       struct goacc_asyncqueue *aq2)
4161 {
4162   /* For serialize, stream aq2 waits for aq1 to complete work that has been
4163      scheduled to run on it up to this point.  */
4164   if (aq1 != aq2)
4165     {
4166       struct placeholder *placeholderp = queue_push_placeholder (aq1);
4167       queue_push_asyncwait (aq2, placeholderp);
4168     }
4169   return true;
4170 }
4171
4172 /* Add an opaque callback to the given async queue.  */
4173
4174 void
4175 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4176                                            void (*fn) (void *), void *data)
4177 {
4178   queue_push_callback (aq, fn, data);
4179 }
4180
4181 /* Queue up an asynchronous data copy from host to DEVICE.  */
4182
4183 bool
4184 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4185                                      size_t n, struct goacc_asyncqueue *aq)
4186 {
4187   struct agent_info *agent = get_agent_info (device);
4188   assert (agent == aq->agent);
4189   queue_push_copy (aq, dst, src, n);
4190   return true;
4191 }
4192
4193 /* Queue up an asynchronous data copy from DEVICE to host.  */
4194
4195 bool
4196 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4197                                      size_t n, struct goacc_asyncqueue *aq)
4198 {
4199   struct agent_info *agent = get_agent_info (device);
4200   assert (agent == aq->agent);
4201   queue_push_copy (aq, dst, src, n);
4202   return true;
4203 }
4204
4205 union goacc_property_value
4206 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4207 {
4208   struct agent_info *agent = get_agent_info (device);
4209
4210   union goacc_property_value propval = { .val = 0 };
4211
4212   switch (prop)
4213     {
4214     case GOACC_PROPERTY_FREE_MEMORY:
4215       /* Not supported. */
4216       break;
4217     case GOACC_PROPERTY_MEMORY:
4218       {
4219         size_t size;
4220         hsa_region_t region = agent->data_region;
4221         hsa_status_t status =
4222           hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4223         if (status == HSA_STATUS_SUCCESS)
4224           propval.val = size;
4225         break;
4226       }
4227     case GOACC_PROPERTY_NAME:
4228       propval.ptr = agent->name;
4229       break;
4230     case GOACC_PROPERTY_VENDOR:
4231       propval.ptr = agent->vendor_name;
4232       break;
4233     case GOACC_PROPERTY_DRIVER:
4234       propval.ptr = hsa_context.driver_version_s;
4235       break;
4236     }
4237
4238   return propval;
4239 }
4240
4241 /* Set up plugin-specific thread-local-data (host-side).  */
4242
4243 void *
4244 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4245 {
4246   struct gcn_thread *thread_data
4247     = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4248
4249   thread_data->async = GOMP_ASYNC_SYNC;
4250
4251   return (void *) thread_data;
4252 }
4253
4254 /* Clean up plugin-specific thread-local-data.  */
4255
4256 void
4257 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4258 {
4259   free (data);
4260 }
4261
4262 /* }}} */