1 /* Plugin for AMD GCN execution.
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* {{{ Includes and defines */
41 #include <hsa_ext_amd.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
48 #include "oacc-plugin.h"
52 /* These probably won't be in elf.h for a while. */
54 #define R_AMDGPU_NONE 0
55 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57 #define R_AMDGPU_ABS64 3 /* S + A */
58 #define R_AMDGPU_REL32 4 /* S + A - P */
59 #define R_AMDGPU_REL64 5 /* S + A - P */
60 #define R_AMDGPU_ABS32 6 /* S + A */
61 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
66 #define R_AMDGPU_RELATIVE64 13 /* B + A */
69 /* GCN specific definitions for asynchronous queues. */
71 #define ASYNC_QUEUE_SIZE 64
72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
73 #define DEBUG_QUEUES 0
74 #define DEBUG_THREAD_SLEEP 0
75 #define DEBUG_THREAD_SIGNAL 0
78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
80 /* Secure getenv() which returns NULL if running as SUID/SGID. */
81 #ifndef HAVE_SECURE_GETENV
82 #ifdef HAVE___SECURE_GETENV
83 #define secure_getenv __secure_getenv
84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.

   Returns the value of NAME from the environment, or NULL when the process
   is running with elevated privileges (real and effective user or group IDs
   differ), so that environment-driven configuration cannot be abused in
   SUID/SGID contexts.  */

static char *
secure_getenv (const char *name)
{
  /* Only trust the environment when we are not SUID/SGID.  */
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}
102 #define secure_getenv getenv
109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
113 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
117 /* As an HSA runtime is dlopened, following structure defines function
118 pointers utilized by the HSA plug-in. */
120 struct hsa_runtime_fn_info
123 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
124 const char **status_string
);
125 hsa_status_t (*hsa_system_get_info_fn
) (hsa_system_info_t attribute
,
127 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
128 hsa_agent_info_t attribute
,
130 hsa_status_t (*hsa_isa_get_info_fn
)(hsa_isa_t isa
,
131 hsa_isa_info_t attribute
,
134 hsa_status_t (*hsa_init_fn
) (void);
135 hsa_status_t (*hsa_iterate_agents_fn
)
136 (hsa_status_t (*callback
)(hsa_agent_t agent
, void *data
), void *data
);
137 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
138 hsa_region_info_t attribute
,
140 hsa_status_t (*hsa_queue_create_fn
)
141 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
142 void (*callback
)(hsa_status_t status
, hsa_queue_t
*source
, void *data
),
143 void *data
, uint32_t private_segment_size
,
144 uint32_t group_segment_size
, hsa_queue_t
**queue
);
145 hsa_status_t (*hsa_agent_iterate_regions_fn
)
147 hsa_status_t (*callback
)(hsa_region_t region
, void *data
), void *data
);
148 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
149 hsa_status_t (*hsa_executable_create_fn
)
150 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
151 const char *options
, hsa_executable_t
*executable
);
152 hsa_status_t (*hsa_executable_global_variable_define_fn
)
153 (hsa_executable_t executable
, const char *variable_name
, void *address
);
154 hsa_status_t (*hsa_executable_load_code_object_fn
)
155 (hsa_executable_t executable
, hsa_agent_t agent
,
156 hsa_code_object_t code_object
, const char *options
);
157 hsa_status_t (*hsa_executable_freeze_fn
)(hsa_executable_t executable
,
158 const char *options
);
159 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
160 uint32_t num_consumers
,
161 const hsa_agent_t
*consumers
,
162 hsa_signal_t
*signal
);
163 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
165 hsa_status_t (*hsa_memory_assign_agent_fn
) (void *ptr
, hsa_agent_t agent
,
166 hsa_access_permission_t access
);
167 hsa_status_t (*hsa_memory_copy_fn
)(void *dst
, const void *src
, size_t size
);
168 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
169 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
170 hsa_status_t (*hsa_executable_get_symbol_fn
)
171 (hsa_executable_t executable
, const char *module_name
,
172 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
173 hsa_executable_symbol_t
*symbol
);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
175 (hsa_executable_symbol_t executable_symbol
,
176 hsa_executable_symbol_info_t attribute
, void *value
);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn
)
178 (hsa_executable_t executable
,
179 hsa_status_t (*callback
)(hsa_executable_t executable
,
180 hsa_executable_symbol_t symbol
, void *data
),
182 uint64_t (*hsa_queue_add_write_index_release_fn
) (const hsa_queue_t
*queue
,
184 uint64_t (*hsa_queue_load_read_index_acquire_fn
) (const hsa_queue_t
*queue
);
185 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
186 hsa_signal_value_t value
);
187 void (*hsa_signal_store_release_fn
) (hsa_signal_t signal
,
188 hsa_signal_value_t value
);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
190 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
191 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
192 hsa_wait_state_t wait_state_hint
);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn
) (hsa_signal_t signal
);
194 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
196 hsa_status_t (*hsa_code_object_deserialize_fn
)
197 (void *serialized_code_object
, size_t serialized_code_object_size
,
198 const char *options
, hsa_code_object_t
*code_object
);
199 hsa_status_t (*hsa_amd_memory_lock_fn
)
200 (void *host_ptr
, size_t size
, hsa_agent_t
*agents
, int num_agent
,
202 hsa_status_t (*hsa_amd_memory_unlock_fn
) (void *host_ptr
);
203 hsa_status_t (*hsa_amd_memory_async_copy_rect_fn
)
204 (const hsa_pitched_ptr_t
*dst
, const hsa_dim3_t
*dst_offset
,
205 const hsa_pitched_ptr_t
*src
, const hsa_dim3_t
*src_offset
,
206 const hsa_dim3_t
*range
, hsa_agent_t copy_agent
,
207 hsa_amd_copy_direction_t dir
, uint32_t num_dep_signals
,
208 const hsa_signal_t
*dep_signals
, hsa_signal_t completion_signal
);
211 /* Structure describing the run-time and grid properties of an HSA kernel
212 lauch. This needs to match the format passed to GOMP_OFFLOAD_run. */
214 struct GOMP_kernel_launch_attributes
216 /* Number of dimensions the workload has. Maximum number is 3. */
218 /* Size of the grid in the three respective dimensions. */
220 /* Size of work-groups in the respective dimensions. */
224 /* Collection of information needed for a dispatch of a kernel from a
227 struct kernel_dispatch
229 struct agent_info
*agent
;
230 /* Pointer to a command queue associated with a kernel dispatch agent. */
232 /* Pointer to a memory space used for kernel arguments passing. */
233 void *kernarg_address
;
236 /* Synchronization signal used for dispatch synchronization. */
238 /* Private segment size. */
239 uint32_t private_segment_size
;
240 /* Group segment size. */
241 uint32_t group_segment_size
;
244 /* Structure of the kernargs segment, supporting console output.
246 This needs to match the definitions in Newlib, and the expectations
247 in libgomp target code. */
250 struct kernargs_abi abi
;
253 struct output output_data
;
256 /* A queue entry for a future asynchronous launch. */
260 struct kernel_info
*kernel
;
262 struct GOMP_kernel_launch_attributes kla
;
265 /* A queue entry for a future callback. */
273 /* A data struct for the copy_data callback. */
280 struct goacc_asyncqueue
*aq
;
283 /* A queue entry for a placeholder. These correspond to a wait event. */
289 pthread_mutex_t mutex
;
292 /* A queue entry for a wait directive. */
294 struct asyncwait_info
296 struct placeholder
*placeholderp
;
299 /* Encode the type of an entry in an async queue. */
309 /* An entry in an async queue. */
313 enum entry_type type
;
315 struct kernel_launch launch
;
316 struct callback callback
;
317 struct asyncwait_info asyncwait
;
318 struct placeholder placeholder
;
322 /* An async queue header.
324 OpenMP may create one of these.
325 OpenACC may create many. */
327 struct goacc_asyncqueue
329 struct agent_info
*agent
;
330 hsa_queue_t
*hsa_queue
;
332 pthread_t thread_drain_queue
;
333 pthread_mutex_t mutex
;
334 pthread_cond_t queue_cond_in
;
335 pthread_cond_t queue_cond_out
;
336 struct queue_entry queue
[ASYNC_QUEUE_SIZE
];
339 int drain_queue_stop
;
342 struct goacc_asyncqueue
*prev
;
343 struct goacc_asyncqueue
*next
;
346 /* Mkoffload uses this structure to describe a kernel.
348 OpenMP kernel dimensions are passed at runtime.
349 OpenACC kernel dimensions are passed at compile time, here. */
351 struct hsa_kernel_description
354 int oacc_dims
[3]; /* Only present for GCN kernels. */
359 /* Mkoffload uses this structure to describe an offload variable. */
361 struct global_var_info
367 /* Mkoffload uses this structure to describe all the kernels in a
368 loadable module. These are passed the libgomp via static constructors. */
370 struct gcn_image_desc
376 const unsigned kernel_count
;
377 struct hsa_kernel_description
*kernel_infos
;
378 const unsigned ind_func_count
;
379 const unsigned global_variable_count
;
382 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
384 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
387 EF_AMDGPU_MACH_UNSUPPORTED
= -1,
388 EF_AMDGPU_MACH_AMDGCN_GFX803
= 0x02a,
389 EF_AMDGPU_MACH_AMDGCN_GFX900
= 0x02c,
390 EF_AMDGPU_MACH_AMDGCN_GFX906
= 0x02f,
391 EF_AMDGPU_MACH_AMDGCN_GFX908
= 0x030,
392 EF_AMDGPU_MACH_AMDGCN_GFX90a
= 0x03f,
393 EF_AMDGPU_MACH_AMDGCN_GFX90c
= 0x032,
394 EF_AMDGPU_MACH_AMDGCN_GFX1030
= 0x036,
395 EF_AMDGPU_MACH_AMDGCN_GFX1036
= 0x045,
396 EF_AMDGPU_MACH_AMDGCN_GFX1100
= 0x041,
397 EF_AMDGPU_MACH_AMDGCN_GFX1103
= 0x044
400 const static int EF_AMDGPU_MACH_MASK
= 0x000000ff;
401 typedef EF_AMDGPU_MACH gcn_isa
;
403 /* Description of an HSA GPU agent (device) and the program associated with
408 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
410 /* The user-visible device number. */
412 /* Whether the agent has been initialized. The fields below are usable only
416 /* The instruction set architecture of the device. */
418 /* Name of the agent. */
420 /* Name of the vendor of the agent. */
421 char vendor_name
[64];
422 /* Command queues of the agent. */
423 hsa_queue_t
*sync_queue
;
424 struct goacc_asyncqueue
*async_queues
, *omp_async_queue
;
425 pthread_mutex_t async_queues_mutex
;
427 /* The HSA memory region from which to allocate kernel arguments. */
428 hsa_region_t kernarg_region
;
430 /* The HSA memory region from which to allocate device data. */
431 hsa_region_t data_region
;
433 /* Allocated ephemeral memories (team arena and stack space). */
434 struct ephemeral_memories_list
*ephemeral_memories_list
;
435 pthread_mutex_t ephemeral_memories_write_lock
;
437 /* Read-write lock that protects kernels which are running or about to be run
438 from interference with loading and unloading of images. Needs to be
439 locked for reading while a kernel is being run, and for writing if the
440 list of modules is manipulated (and thus the HSA program invalidated). */
441 pthread_rwlock_t module_rwlock
;
443 /* The module associated with this kernel. */
444 struct module_info
*module
;
446 /* Mutex enforcing that only one thread will finalize the HSA program. A
447 thread should have locked agent->module_rwlock for reading before
449 pthread_mutex_t prog_mutex
;
450 /* Flag whether the HSA program that consists of all the modules has been
453 /* HSA executable - the finalized program that is used to locate kernels. */
454 hsa_executable_t executable
;
457 /* Information required to identify, finalize and run any given kernel. */
459 enum offload_kind
{KIND_UNKNOWN
, KIND_OPENMP
, KIND_OPENACC
};
463 /* Name of the kernel, required to locate it within the GCN object-code
466 /* The specific agent the kernel has been or will be finalized for and run
468 struct agent_info
*agent
;
469 /* The specific module where the kernel takes place. */
470 struct module_info
*module
;
471 /* Information provided by mkoffload associated with the kernel. */
472 struct hsa_kernel_description
*description
;
473 /* Mutex enforcing that at most once thread ever initializes a kernel for
474 use. A thread should have locked agent->module_rwlock for reading before
476 pthread_mutex_t init_mutex
;
477 /* Flag indicating whether the kernel has been initialized and all fields
478 below it contain valid data. */
480 /* Flag indicating that the kernel has a problem that blocks an execution. */
481 bool initialization_failed
;
482 /* The object to be put into the dispatch queue. */
484 /* Required size of kernel arguments. */
485 uint32_t kernarg_segment_size
;
486 /* Required size of group segment. */
487 uint32_t group_segment_size
;
488 /* Required size of private segment. */
489 uint32_t private_segment_size
;
490 /* Set up for OpenMP or OpenACC? */
491 enum offload_kind kind
;
494 /* Information about a particular GCN module, its image and kernels. */
498 /* The description with which the program has registered the image. */
499 struct gcn_image_desc
*image_desc
;
500 /* GCN heap allocation. */
502 /* Physical boundaries of the loaded module. */
503 Elf64_Addr phys_address_start
;
504 Elf64_Addr phys_address_end
;
506 bool constructors_run_p
;
507 struct kernel_info
*init_array_func
, *fini_array_func
;
509 /* Number of kernels in this module. */
511 /* An array of kernel_info structures describing each kernel in this
513 struct kernel_info kernels
[];
516 /* A linked list of memory arenas allocated on the device.
517 These are used by OpenMP, as a means to optimize per-team malloc,
518 and for host-accessible stack space. */
520 struct ephemeral_memories_list
522 struct ephemeral_memories_list
*next
;
524 /* The size is determined by the number of teams and threads. */
526 /* The device address allocated memory. */
528 /* A flag to prevent two asynchronous kernels trying to use the same memory.
529 The mutex is locked until the kernel exits. */
530 pthread_mutex_t in_use
;
533 /* Information about the whole HSA environment and all of its agents. */
535 struct hsa_context_info
537 /* Whether the structure has been initialized. */
539 /* Number of usable GPU HSA agents in the system. */
541 /* Array of agent_info structures describing the individual HSA agents. */
542 struct agent_info
*agents
;
543 /* Driver version string. */
544 char driver_version_s
[30];
548 /* {{{ Global variables */
550 /* Information about the whole HSA environment and all of its agents. */
552 static struct hsa_context_info hsa_context
;
554 /* HSA runtime functions that are initialized in init_hsa_context. */
556 static struct hsa_runtime_fn_info hsa_fns
;
558 /* Heap space, allocated target-side, provided for use of newlib malloc.
559 Each module should have it's own heap allocated.
560 Beware that heap usage increases with OpenMP teams. See also arenas. */
562 static size_t gcn_kernel_heap_size
= DEFAULT_GCN_HEAP_SIZE
;
564 /* Ephemeral memory sizes for each kernel launch. */
566 static int team_arena_size
= DEFAULT_TEAM_ARENA_SIZE
;
567 static int stack_size
= DEFAULT_GCN_STACK_SIZE
;
568 static int lowlat_size
= -1;
570 /* Flag to decide whether print to stderr information about what is going on.
571 Set in init_debug depending on environment variables. */
575 /* Flag to decide if the runtime should suppress a possible fallback to host
578 static bool suppress_host_fallback
;
580 /* Flag to locate HSA runtime shared library that is dlopened
583 static const char *hsa_runtime_lib
;
585 /* Flag to decide if the runtime should support also CPU devices (can be
588 static bool support_cpu_devices
;
590 /* Runtime dimension overrides. Zero indicates default. */
592 static int override_x_dim
= 0;
593 static int override_z_dim
= 0;
596 /* {{{ Debug & Diagnostic */
598 /* Print a message to stderr if GCN_DEBUG value is set to true. */
600 #define DEBUG_PRINT(...) \
605 fprintf (stderr, __VA_ARGS__); \
610 /* Flush stderr if GCN_DEBUG value is set to true. */
612 #define DEBUG_FLUSH() \
618 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
621 #define DEBUG_LOG(prefix, ...) \
624 DEBUG_PRINT (prefix); \
625 DEBUG_PRINT (__VA_ARGS__); \
629 /* Print a debugging message to stderr. */
631 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
633 /* Print a warning message to stderr. */
635 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
637 /* Print HSA warning STR with an HSA STATUS code. */
640 hsa_warn (const char *str
, hsa_status_t status
)
645 const char *hsa_error_msg
= "[unknown]";
646 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
648 fprintf (stderr
, "GCN warning: %s\nRuntime message: %s\n", str
,
652 /* Report a fatal error STR together with the HSA error corresponding to STATUS
653 and terminate execution of the current process. */
656 hsa_fatal (const char *str
, hsa_status_t status
)
658 const char *hsa_error_msg
= "[unknown]";
659 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
660 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str
,
664 /* Like hsa_fatal, except only report error message, and return FALSE
665 for propagating error processing to outside of plugin. */
668 hsa_error (const char *str
, hsa_status_t status
)
670 const char *hsa_error_msg
= "[unknown]";
671 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
672 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str
,
677 /* Dump information about the available hardware. */
680 dump_hsa_system_info (void)
684 hsa_endianness_t endianness
;
685 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS
,
687 if (status
== HSA_STATUS_SUCCESS
)
690 case HSA_ENDIANNESS_LITTLE
:
691 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
693 case HSA_ENDIANNESS_BIG
:
694 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
697 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
700 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
702 uint8_t extensions
[128];
703 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS
,
705 if (status
== HSA_STATUS_SUCCESS
)
707 if (extensions
[0] & (1 << HSA_EXTENSION_IMAGES
))
708 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
711 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
714 /* Dump information about the available hardware. */
717 dump_machine_model (hsa_machine_model_t machine_model
, const char *s
)
719 switch (machine_model
)
721 case HSA_MACHINE_MODEL_SMALL
:
722 GCN_DEBUG ("%s: SMALL\n", s
);
724 case HSA_MACHINE_MODEL_LARGE
:
725 GCN_DEBUG ("%s: LARGE\n", s
);
728 GCN_WARNING ("%s: UNKNOWN\n", s
);
733 /* Dump information about the available hardware. */
736 dump_profile (hsa_profile_t profile
, const char *s
)
740 case HSA_PROFILE_FULL
:
741 GCN_DEBUG ("%s: FULL\n", s
);
743 case HSA_PROFILE_BASE
:
744 GCN_DEBUG ("%s: BASE\n", s
);
747 GCN_WARNING ("%s: UNKNOWN\n", s
);
752 /* Dump information about a device memory region. */
755 dump_hsa_region (hsa_region_t region
, void *data
__attribute__((unused
)))
759 hsa_region_segment_t segment
;
760 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
762 if (status
== HSA_STATUS_SUCCESS
)
764 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
765 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
766 else if (segment
== HSA_REGION_SEGMENT_READONLY
)
767 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
768 else if (segment
== HSA_REGION_SEGMENT_PRIVATE
)
769 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
770 else if (segment
== HSA_REGION_SEGMENT_GROUP
)
771 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
773 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
776 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
778 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
782 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
784 if (status
== HSA_STATUS_SUCCESS
)
786 if (flags
& HSA_REGION_GLOBAL_FLAG_KERNARG
)
787 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
788 if (flags
& HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
)
789 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
790 if (flags
& HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
)
791 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
794 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
798 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SIZE
, &size
);
799 if (status
== HSA_STATUS_SUCCESS
)
800 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size
);
802 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
805 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_ALLOC_MAX_SIZE
,
807 if (status
== HSA_STATUS_SUCCESS
)
808 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size
);
810 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
814 = hsa_fns
.hsa_region_get_info_fn (region
,
815 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED
,
817 if (status
== HSA_STATUS_SUCCESS
)
818 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed
);
820 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
822 if (status
!= HSA_STATUS_SUCCESS
|| !alloc_allowed
)
823 return HSA_STATUS_SUCCESS
;
826 = hsa_fns
.hsa_region_get_info_fn (region
,
827 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
,
829 if (status
== HSA_STATUS_SUCCESS
)
830 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size
);
832 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
836 = hsa_fns
.hsa_region_get_info_fn (region
,
837 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT
,
839 if (status
== HSA_STATUS_SUCCESS
)
840 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align
);
842 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
844 return HSA_STATUS_SUCCESS
;
847 /* Dump information about all the device memory regions. */
850 dump_hsa_regions (hsa_agent_t agent
)
853 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
,
856 if (status
!= HSA_STATUS_SUCCESS
)
857 hsa_error ("Dumping hsa regions failed", status
);
860 /* Dump information about the available devices. */
863 dump_hsa_agent_info (hsa_agent_t agent
, void *data
__attribute__((unused
)))
868 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_NAME
,
870 if (status
== HSA_STATUS_SUCCESS
)
871 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf
);
873 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
875 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_VENDOR_NAME
,
877 if (status
== HSA_STATUS_SUCCESS
)
878 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf
);
880 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
882 hsa_machine_model_t machine_model
;
884 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_MACHINE_MODEL
,
886 if (status
== HSA_STATUS_SUCCESS
)
887 dump_machine_model (machine_model
, "HSA_AGENT_INFO_MACHINE_MODEL");
889 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
891 hsa_profile_t profile
;
892 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_PROFILE
,
894 if (status
== HSA_STATUS_SUCCESS
)
895 dump_profile (profile
, "HSA_AGENT_INFO_PROFILE");
897 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
899 hsa_device_type_t device_type
;
900 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
902 if (status
== HSA_STATUS_SUCCESS
)
906 case HSA_DEVICE_TYPE_CPU
:
907 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
909 case HSA_DEVICE_TYPE_GPU
:
910 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
912 case HSA_DEVICE_TYPE_DSP
:
913 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
916 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
921 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
924 status
= hsa_fns
.hsa_agent_get_info_fn
925 (agent
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
926 if (status
== HSA_STATUS_SUCCESS
)
927 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count
);
929 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
932 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_WAVEFRONT_SIZE
,
934 if (status
== HSA_STATUS_SUCCESS
)
935 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size
);
937 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
940 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
941 HSA_AGENT_INFO_WORKGROUP_MAX_DIM
,
943 if (status
== HSA_STATUS_SUCCESS
)
944 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim
);
946 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
949 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
950 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE
,
952 if (status
== HSA_STATUS_SUCCESS
)
953 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size
);
955 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
957 uint32_t grid_max_dim
;
958 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_DIM
,
960 if (status
== HSA_STATUS_SUCCESS
)
961 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim
);
963 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
965 uint32_t grid_max_size
;
966 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_SIZE
,
968 if (status
== HSA_STATUS_SUCCESS
)
969 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size
);
971 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
973 dump_hsa_regions (agent
);
975 return HSA_STATUS_SUCCESS
;
978 /* Forward reference. */
980 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol
);
982 /* Helper function for dump_executable_symbols. */
985 dump_executable_symbol (hsa_executable_t executable
,
986 hsa_executable_symbol_t symbol
,
987 void *data
__attribute__((unused
)))
989 char *name
= get_executable_symbol_name (symbol
);
993 GCN_DEBUG ("executable symbol: %s\n", name
);
997 return HSA_STATUS_SUCCESS
;
1000 /* Dump all global symbol in an executable. */
1003 dump_executable_symbols (hsa_executable_t executable
)
1005 hsa_status_t status
;
1007 = hsa_fns
.hsa_executable_iterate_symbols_fn (executable
,
1008 dump_executable_symbol
,
1010 if (status
!= HSA_STATUS_SUCCESS
)
1011 hsa_fatal ("Could not dump HSA executable symbols", status
);
1014 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1017 print_kernel_dispatch (struct kernel_dispatch
*dispatch
, unsigned indent
)
1019 struct kernargs
*kernargs
= (struct kernargs
*)dispatch
->kernarg_address
;
1021 fprintf (stderr
, "%*sthis: %p\n", indent
, "", dispatch
);
1022 fprintf (stderr
, "%*squeue: %p\n", indent
, "", dispatch
->queue
);
1023 fprintf (stderr
, "%*skernarg_address: %p\n", indent
, "", kernargs
);
1024 fprintf (stderr
, "%*sheap address: %p\n", indent
, "",
1025 (void*)kernargs
->abi
.heap_ptr
);
1026 fprintf (stderr
, "%*sarena address: %p (%d bytes per workgroup)\n", indent
,
1027 "", (void*)kernargs
->abi
.arena_ptr
,
1028 kernargs
->abi
.arena_size_per_team
);
1029 fprintf (stderr
, "%*sstack address: %p (%d bytes per wavefront)\n", indent
,
1030 "", (void*)kernargs
->abi
.stack_ptr
,
1031 kernargs
->abi
.stack_size_per_thread
);
1032 fprintf (stderr
, "%*sobject: %lu\n", indent
, "", dispatch
->object
);
1033 fprintf (stderr
, "%*sprivate_segment_size: %u\n", indent
, "",
1034 dispatch
->private_segment_size
);
1035 fprintf (stderr
, "%*sgroup_segment_size: %u (low-latency pool)\n", indent
,
1036 "", dispatch
->group_segment_size
);
1037 fprintf (stderr
, "\n");
1041 /* {{{ Utility functions */
/* Cast the thread local storage to gcn_thread.  The pointer comes from
   GOMP_PLUGIN_acc_thread (), so it is NULL outside an OpenACC thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}
1051 /* Initialize debug and suppress_host_fallback according to the environment. */
1054 init_environment_variables (void)
1056 if (secure_getenv ("GCN_DEBUG"))
1061 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1062 suppress_host_fallback
= true;
1064 suppress_host_fallback
= false;
1066 hsa_runtime_lib
= secure_getenv ("HSA_RUNTIME_LIB");
1067 if (hsa_runtime_lib
== NULL
)
1068 hsa_runtime_lib
= "libhsa-runtime64.so.1";
1070 support_cpu_devices
= secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1072 const char *x
= secure_getenv ("GCN_NUM_TEAMS");
1074 x
= secure_getenv ("GCN_NUM_GANGS");
1076 override_x_dim
= atoi (x
);
1078 const char *z
= secure_getenv ("GCN_NUM_THREADS");
1080 z
= secure_getenv ("GCN_NUM_WORKERS");
1082 override_z_dim
= atoi (z
);
1084 const char *heap
= secure_getenv ("GCN_HEAP_SIZE");
1087 size_t tmp
= atol (heap
);
1089 gcn_kernel_heap_size
= tmp
;
1092 const char *arena
= secure_getenv ("GCN_TEAM_ARENA_SIZE");
1095 int tmp
= atoi (arena
);
1097 team_arena_size
= tmp
;;
1100 const char *stack
= secure_getenv ("GCN_STACK_SIZE");
1103 int tmp
= atoi (stack
);
1108 const char *lowlat
= secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1110 lowlat_size
= atoi (lowlat
);
1113 /* Return malloc'd string with name of SYMBOL. */
1116 get_executable_symbol_name (hsa_executable_symbol_t symbol
)
1118 hsa_status_t status
;
1121 const hsa_executable_symbol_info_t info_name_length
1122 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
;
1124 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name_length
,
1126 if (status
!= HSA_STATUS_SUCCESS
)
1128 hsa_error ("Could not get length of symbol name", status
);
1132 res
= GOMP_PLUGIN_malloc (len
+ 1);
1134 const hsa_executable_symbol_info_t info_name
1135 = HSA_EXECUTABLE_SYMBOL_INFO_NAME
;
1137 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name
, res
);
1139 if (status
!= HSA_STATUS_SUCCESS
)
1141 hsa_error ("Could not get symbol name", status
);
1151 /* Get the number of GPU Compute Units. */
1154 get_cu_count (struct agent_info
*agent
)
1157 hsa_status_t status
= hsa_fns
.hsa_agent_get_info_fn
1158 (agent
->id
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
1159 if (status
== HSA_STATUS_SUCCESS
)
1162 return 64; /* The usual number for older devices. */
/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  Currently a simple
   clamp; values <= 16 pass through unchanged.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more inteligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}
/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.
   TEAMS values at or below the limit pass through unchanged.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int max_teams = 2 * get_cu_count (agent);
  if (teams > max_teams)
    teams = max_teams;
  return teams;
}
1191 /* Parse the target attributes INPUT provided by the compiler and return true
1192 if we should run anything all. If INPUT is NULL, fill DEF with default
1193 values, then store INPUT or DEF into *RESULT.
1195 This is used for OpenMP only. */
1198 parse_target_attributes (void **input
,
1199 struct GOMP_kernel_launch_attributes
*def
,
1200 struct GOMP_kernel_launch_attributes
**result
,
1201 struct agent_info
*agent
)
1204 GOMP_PLUGIN_fatal ("No target arguments provided");
1206 bool grid_attrs_found
= false;
1207 bool gcn_dims_found
= false;
1209 int gcn_threads
= 0;
1212 intptr_t id
= (intptr_t) *input
++, val
;
1214 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
1215 val
= (intptr_t) *input
++;
1217 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
1219 val
= (val
> INT_MAX
) ? INT_MAX
: val
;
1221 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) == GOMP_DEVICE_GCN
1222 && ((id
& GOMP_TARGET_ARG_ID_MASK
)
1223 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES
))
1225 grid_attrs_found
= true;
1228 else if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
)
1229 == GOMP_TARGET_ARG_DEVICE_ALL
)
1231 gcn_dims_found
= true;
1232 switch (id
& GOMP_TARGET_ARG_ID_MASK
)
1234 case GOMP_TARGET_ARG_NUM_TEAMS
:
1235 gcn_teams
= limit_teams (val
, agent
);
1237 case GOMP_TARGET_ARG_THREAD_LIMIT
:
1238 gcn_threads
= limit_worker_threads (val
);
1248 bool gfx900_workaround_p
= false;
1250 if (agent
->device_isa
== EF_AMDGPU_MACH_AMDGCN_GFX900
1251 && gcn_threads
== 0 && override_z_dim
== 0)
1253 gfx900_workaround_p
= true;
1254 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1255 "threads to at most 4 per team.\n");
1256 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1257 "GCN_NUM_THREADS=16\n");
1260 /* Ideally, when a dimension isn't explicitly specified, we should
1261 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1262 In practice, we tune for peak performance on BabelStream, which
1263 for OpenACC is currently 32 threads per CU. */
1265 if (gcn_teams
<= 0 && gcn_threads
<= 0)
1267 /* Set up a reasonable number of teams and threads. */
1268 gcn_threads
= gfx900_workaround_p
? 4 : 16; // 8;
1269 def
->gdims
[0] = get_cu_count (agent
); // * (40 / gcn_threads);
1270 def
->gdims
[2] = gcn_threads
;
1272 else if (gcn_teams
<= 0 && gcn_threads
> 0)
1274 /* Auto-scale the number of teams with the number of threads. */
1275 def
->gdims
[0] = get_cu_count (agent
); // * (40 / gcn_threads);
1276 def
->gdims
[2] = gcn_threads
;
1278 else if (gcn_teams
> 0 && gcn_threads
<= 0)
1280 int max_threads
= gfx900_workaround_p
? 4 : 16;
1282 /* Auto-scale the number of threads with the number of teams. */
1283 def
->gdims
[0] = gcn_teams
;
1284 def
->gdims
[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1285 if (def
->gdims
[2] == 0)
1287 else if (def
->gdims
[2] > max_threads
)
1288 def
->gdims
[2] = max_threads
;
1292 def
->gdims
[0] = gcn_teams
;
1293 def
->gdims
[2] = gcn_threads
;
1295 def
->gdims
[1] = 64; /* Each thread is 64 work items wide. */
1296 def
->wdims
[0] = 1; /* Single team per work-group. */
1302 else if (!grid_attrs_found
)
1312 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1316 struct GOMP_kernel_launch_attributes
*kla
;
1317 kla
= (struct GOMP_kernel_launch_attributes
*) *input
;
1319 if (kla
->ndim
== 0 || kla
->ndim
> 3)
1320 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla
->ndim
);
1322 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla
->ndim
);
1324 for (i
= 0; i
< kla
->ndim
; i
++)
1326 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i
,
1327 kla
->gdims
[i
], kla
->wdims
[i
]);
1328 if (kla
->gdims
[i
] == 0)
1334 /* Return the group size given the requested GROUP size, GRID size and number
1335 of grid dimensions NDIM. */
1338 get_group_size (uint32_t ndim
, uint32_t grid
, uint32_t group
)
1342 /* TODO: Provide a default via environment or device characteristics. */
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.
   HEADER occupies the low 16 bits and REST the high 16 bits; a single
   release store publishes both fields to the device at once.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
}
1364 /* A never-called callback for the HSA command queues. These signal events
1365 that we don't use, so we trigger an error.
1367 This "queue" is not to be confused with the async queues, below. */
1370 hsa_queue_callback (hsa_status_t status
,
1371 hsa_queue_t
*queue
__attribute__ ((unused
)),
1372 void *data
__attribute__ ((unused
)))
1374 hsa_fatal ("Asynchronous queue error", status
);
1378 /* {{{ HSA initialization */
1380 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1383 init_hsa_runtime_functions (void)
1385 #define DLSYM_FN(function) \
1386 hsa_fns.function##_fn = dlsym (handle, #function); \
1387 if (hsa_fns.function##_fn == NULL) \
1388 GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
1389 #define DLSYM_OPT_FN(function) \
1390 hsa_fns.function##_fn = dlsym (handle, #function);
1392 void *handle
= dlopen (hsa_runtime_lib
, RTLD_LAZY
);
1396 DLSYM_FN (hsa_status_string
)
1397 DLSYM_FN (hsa_system_get_info
)
1398 DLSYM_FN (hsa_agent_get_info
)
1400 DLSYM_FN (hsa_iterate_agents
)
1401 DLSYM_FN (hsa_region_get_info
)
1402 DLSYM_FN (hsa_queue_create
)
1403 DLSYM_FN (hsa_agent_iterate_regions
)
1404 DLSYM_FN (hsa_executable_destroy
)
1405 DLSYM_FN (hsa_executable_create
)
1406 DLSYM_FN (hsa_executable_global_variable_define
)
1407 DLSYM_FN (hsa_executable_load_code_object
)
1408 DLSYM_FN (hsa_executable_freeze
)
1409 DLSYM_FN (hsa_signal_create
)
1410 DLSYM_FN (hsa_memory_allocate
)
1411 DLSYM_FN (hsa_memory_assign_agent
)
1412 DLSYM_FN (hsa_memory_copy
)
1413 DLSYM_FN (hsa_memory_free
)
1414 DLSYM_FN (hsa_signal_destroy
)
1415 DLSYM_FN (hsa_executable_get_symbol
)
1416 DLSYM_FN (hsa_executable_symbol_get_info
)
1417 DLSYM_FN (hsa_executable_iterate_symbols
)
1418 DLSYM_FN (hsa_queue_add_write_index_release
)
1419 DLSYM_FN (hsa_queue_load_read_index_acquire
)
1420 DLSYM_FN (hsa_signal_wait_acquire
)
1421 DLSYM_FN (hsa_signal_store_relaxed
)
1422 DLSYM_FN (hsa_signal_store_release
)
1423 DLSYM_FN (hsa_signal_load_acquire
)
1424 DLSYM_FN (hsa_queue_destroy
)
1425 DLSYM_FN (hsa_code_object_deserialize
)
1426 DLSYM_OPT_FN (hsa_amd_memory_lock
)
1427 DLSYM_OPT_FN (hsa_amd_memory_unlock
)
1428 DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect
)
1434 static gcn_isa
isa_code (const char *isa
);
1436 /* Return true if the agent is a GPU and can accept of concurrent submissions
1437 from different threads. */
1440 suitable_hsa_agent_p (hsa_agent_t agent
)
1442 hsa_device_type_t device_type
;
1444 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
1446 if (status
!= HSA_STATUS_SUCCESS
)
1449 switch (device_type
)
1451 case HSA_DEVICE_TYPE_GPU
:
1455 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_NAME
, name
);
1456 if (status
!= HSA_STATUS_SUCCESS
1457 || isa_code (name
) == EF_AMDGPU_MACH_UNSUPPORTED
)
1459 GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
1460 status
== HSA_STATUS_SUCCESS
? name
: "invalid");
1465 case HSA_DEVICE_TYPE_CPU
:
1466 if (!support_cpu_devices
)
1473 uint32_t features
= 0;
1474 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_FEATURE
,
1476 if (status
!= HSA_STATUS_SUCCESS
1477 || !(features
& HSA_AGENT_FEATURE_KERNEL_DISPATCH
))
1479 hsa_queue_type_t queue_type
;
1480 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_QUEUE_TYPE
,
1482 if (status
!= HSA_STATUS_SUCCESS
1483 || (queue_type
!= HSA_QUEUE_TYPE_MULTI
))
1489 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1490 agent_count in hsa_context. */
1493 count_gpu_agents (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
1495 if (suitable_hsa_agent_p (agent
))
1496 hsa_context
.agent_count
++;
1497 return HSA_STATUS_SUCCESS
;
1500 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1501 id to the describing structure in the hsa context. The index of the
1502 structure is pointed to by DATA, increment it afterwards. */
1505 assign_agent_ids (hsa_agent_t agent
, void *data
)
1507 if (suitable_hsa_agent_p (agent
))
1509 int *agent_index
= (int *) data
;
1510 hsa_context
.agents
[*agent_index
].id
= agent
;
1513 return HSA_STATUS_SUCCESS
;
1516 /* Initialize hsa_context if it has not already been done.
1517 If !PROBE: returns TRUE on success.
1518 If PROBE: returns TRUE on success or if the plugin/device shall be silently
1519 ignored, and otherwise emits an error and returns FALSE. */
1522 init_hsa_context (bool probe
)
1524 hsa_status_t status
;
1525 int agent_index
= 0;
1527 if (hsa_context
.initialized
)
1529 init_environment_variables ();
1530 if (!init_hsa_runtime_functions ())
1532 const char *msg
= "Run-time could not be dynamically opened";
1533 if (suppress_host_fallback
)
1534 GOMP_PLUGIN_fatal ("%s\n", msg
);
1536 GCN_WARNING ("%s\n", msg
);
1537 return probe
? true : false;
1539 status
= hsa_fns
.hsa_init_fn ();
1540 if (status
!= HSA_STATUS_SUCCESS
)
1541 return hsa_error ("Run-time could not be initialized", status
);
1542 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1545 dump_hsa_system_info ();
1547 status
= hsa_fns
.hsa_iterate_agents_fn (count_gpu_agents
, NULL
);
1548 if (status
!= HSA_STATUS_SUCCESS
)
1549 return hsa_error ("GCN GPU devices could not be enumerated", status
);
1550 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context
.agent_count
);
1553 = GOMP_PLUGIN_malloc_cleared (hsa_context
.agent_count
1554 * sizeof (struct agent_info
));
1555 status
= hsa_fns
.hsa_iterate_agents_fn (assign_agent_ids
, &agent_index
);
1556 if (status
!= HSA_STATUS_SUCCESS
)
1557 return hsa_error ("Scanning compute agents failed", status
);
1558 if (agent_index
!= hsa_context
.agent_count
)
1560 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1566 status
= hsa_fns
.hsa_iterate_agents_fn (dump_hsa_agent_info
, NULL
);
1567 if (status
!= HSA_STATUS_SUCCESS
)
1568 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1571 uint16_t minor
, major
;
1572 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR
,
1574 if (status
!= HSA_STATUS_SUCCESS
)
1575 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1576 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR
,
1578 if (status
!= HSA_STATUS_SUCCESS
)
1579 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1581 size_t len
= sizeof hsa_context
.driver_version_s
;
1582 int printed
= snprintf (hsa_context
.driver_version_s
, len
,
1583 "HSA Runtime %hu.%hu", (unsigned short int)major
,
1584 (unsigned short int)minor
);
1586 GCN_WARNING ("HSA runtime version string was truncated."
1587 "Version %hu.%hu is too long.", (unsigned short int)major
,
1588 (unsigned short int)minor
);
1590 hsa_context
.initialized
= true;
1594 /* Verify that hsa_context has already been initialized and return the
1595 agent_info structure describing device number N. Return NULL on error. */
1597 static struct agent_info
*
1598 get_agent_info (int n
)
1600 if (!hsa_context
.initialized
)
1602 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1605 if (n
>= hsa_context
.agent_count
)
1607 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n
);
1610 if (!hsa_context
.agents
[n
].initialized
)
1612 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1615 return &hsa_context
.agents
[n
];
1618 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1620 Selects (breaks at) a suitable region of type KIND. */
1623 get_memory_region (hsa_region_t region
, hsa_region_t
*retval
,
1624 hsa_region_global_flag_t kind
)
1626 hsa_status_t status
;
1627 hsa_region_segment_t segment
;
1629 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
1631 if (status
!= HSA_STATUS_SUCCESS
)
1633 if (segment
!= HSA_REGION_SEGMENT_GLOBAL
)
1634 return HSA_STATUS_SUCCESS
;
1637 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
1639 if (status
!= HSA_STATUS_SUCCESS
)
1644 return HSA_STATUS_INFO_BREAK
;
1646 return HSA_STATUS_SUCCESS
;
1649 /* Callback of hsa_agent_iterate_regions.
1651 Selects a kernargs memory region. */
1654 get_kernarg_memory_region (hsa_region_t region
, void *data
)
1656 return get_memory_region (region
, (hsa_region_t
*)data
,
1657 HSA_REGION_GLOBAL_FLAG_KERNARG
);
1660 /* Callback of hsa_agent_iterate_regions.
1662 Selects a coarse-grained memory region suitable for the heap and
1666 get_data_memory_region (hsa_region_t region
, void *data
)
1668 return get_memory_region (region
, (hsa_region_t
*)data
,
1669 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
);
1673 elf_gcn_isa_field (Elf64_Ehdr
*image
)
1675 return image
->e_flags
& EF_AMDGPU_MACH_MASK
;
/* ISA names as reported by the HSA runtime; compared by isa_code using the
   common "gfxNNN" prefix length below.  */

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static char *gcn_gfx90c_s = "gfx90c";
const static char *gcn_gfx1030_s = "gfx1030";
const static char *gcn_gfx1036_s = "gfx1036";
const static char *gcn_gfx1100_s = "gfx1100";
const static char *gcn_gfx1103_s = "gfx1103";
const static int gcn_isa_name_len = 7;
1690 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1694 isa_hsa_name (int isa
) {
1697 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1698 return gcn_gfx803_s
;
1699 case EF_AMDGPU_MACH_AMDGCN_GFX900
:
1700 return gcn_gfx900_s
;
1701 case EF_AMDGPU_MACH_AMDGCN_GFX906
:
1702 return gcn_gfx906_s
;
1703 case EF_AMDGPU_MACH_AMDGCN_GFX908
:
1704 return gcn_gfx908_s
;
1705 case EF_AMDGPU_MACH_AMDGCN_GFX90a
:
1706 return gcn_gfx90a_s
;
1707 case EF_AMDGPU_MACH_AMDGCN_GFX90c
:
1708 return gcn_gfx90c_s
;
1709 case EF_AMDGPU_MACH_AMDGCN_GFX1030
:
1710 return gcn_gfx1030_s
;
1711 case EF_AMDGPU_MACH_AMDGCN_GFX1036
:
1712 return gcn_gfx1036_s
;
1713 case EF_AMDGPU_MACH_AMDGCN_GFX1100
:
1714 return gcn_gfx1100_s
;
1715 case EF_AMDGPU_MACH_AMDGCN_GFX1103
:
1716 return gcn_gfx1103_s
;
1721 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1722 with -march) or NULL if we do not support the ISA.
1723 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1726 isa_gcc_name (int isa
) {
1729 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1732 return isa_hsa_name (isa
);
1736 /* Returns the code which is used in the GCN object code to identify the ISA with
1737 the given name (as used by the HSA runtime). */
1740 isa_code(const char *isa
) {
1741 if (!strncmp (isa
, gcn_gfx803_s
, gcn_isa_name_len
))
1742 return EF_AMDGPU_MACH_AMDGCN_GFX803
;
1744 if (!strncmp (isa
, gcn_gfx900_s
, gcn_isa_name_len
))
1745 return EF_AMDGPU_MACH_AMDGCN_GFX900
;
1747 if (!strncmp (isa
, gcn_gfx906_s
, gcn_isa_name_len
))
1748 return EF_AMDGPU_MACH_AMDGCN_GFX906
;
1750 if (!strncmp (isa
, gcn_gfx908_s
, gcn_isa_name_len
))
1751 return EF_AMDGPU_MACH_AMDGCN_GFX908
;
1753 if (!strncmp (isa
, gcn_gfx90a_s
, gcn_isa_name_len
))
1754 return EF_AMDGPU_MACH_AMDGCN_GFX90a
;
1756 if (!strncmp (isa
, gcn_gfx90c_s
, gcn_isa_name_len
))
1757 return EF_AMDGPU_MACH_AMDGCN_GFX90c
;
1759 if (!strncmp (isa
, gcn_gfx1030_s
, gcn_isa_name_len
))
1760 return EF_AMDGPU_MACH_AMDGCN_GFX1030
;
1762 if (!strncmp (isa
, gcn_gfx1036_s
, gcn_isa_name_len
))
1763 return EF_AMDGPU_MACH_AMDGCN_GFX1036
;
1765 if (!strncmp (isa
, gcn_gfx1100_s
, gcn_isa_name_len
))
1766 return EF_AMDGPU_MACH_AMDGCN_GFX1100
;
1768 if (!strncmp (isa
, gcn_gfx1103_s
, gcn_isa_name_len
))
1769 return EF_AMDGPU_MACH_AMDGCN_GFX1103
;
1771 return EF_AMDGPU_MACH_UNSUPPORTED
;
1774 /* CDNA2 devices have twice as many VGPRs compared to older devices. */
1777 max_isa_vgprs (int isa
)
1781 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1782 case EF_AMDGPU_MACH_AMDGCN_GFX900
:
1783 case EF_AMDGPU_MACH_AMDGCN_GFX906
:
1784 case EF_AMDGPU_MACH_AMDGCN_GFX908
:
1786 case EF_AMDGPU_MACH_AMDGCN_GFX90a
:
1788 case EF_AMDGPU_MACH_AMDGCN_GFX90c
:
1790 case EF_AMDGPU_MACH_AMDGCN_GFX1030
:
1791 case EF_AMDGPU_MACH_AMDGCN_GFX1036
:
1792 return 512; /* 512 SIMD32 = 256 wavefrontsize64. */
1793 case EF_AMDGPU_MACH_AMDGCN_GFX1100
:
1794 case EF_AMDGPU_MACH_AMDGCN_GFX1103
:
1795 return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */
1797 GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1803 /* Create or reuse a team arena and stack space.
1805 Team arenas are used by OpenMP to avoid calling malloc multiple times
1806 while setting up each team. This is purely a performance optimization.
1808 The stack space is used by all kernels. We must allocate it in such a
1809 way that the reverse offload implmentation can access the data.
1811 Allocating this memory costs performance, so this function will reuse an
1812 existing allocation if a large enough one is idle.
1813 The memory lock is released, but not deallocated, when the kernel exits. */
1816 configure_ephemeral_memories (struct kernel_info
*kernel
,
1817 struct kernargs_abi
*kernargs
, int num_teams
,
1820 struct agent_info
*agent
= kernel
->agent
;
1821 struct ephemeral_memories_list
**next_ptr
= &agent
->ephemeral_memories_list
;
1822 struct ephemeral_memories_list
*item
;
1824 int actual_arena_size
= (kernel
->kind
== KIND_OPENMP
1825 ? team_arena_size
: 0);
1826 int actual_arena_total_size
= actual_arena_size
* num_teams
;
1827 size_t size
= (actual_arena_total_size
1828 + num_teams
* num_threads
* stack_size
);
1830 for (item
= *next_ptr
; item
; next_ptr
= &item
->next
, item
= item
->next
)
1832 if (item
->size
< size
)
1835 if (pthread_mutex_trylock (&item
->in_use
) == 0)
1841 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1842 " (%zd bytes)\n", (actual_arena_size
? "arena and " : ""),
1843 num_teams
, num_threads
, size
);
1845 if (pthread_mutex_lock (&agent
->ephemeral_memories_write_lock
))
1847 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1850 item
= malloc (sizeof (*item
));
1855 if (pthread_mutex_init (&item
->in_use
, NULL
))
1857 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1860 if (pthread_mutex_lock (&item
->in_use
))
1862 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1865 if (pthread_mutex_unlock (&agent
->ephemeral_memories_write_lock
))
1867 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1871 hsa_status_t status
;
1872 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
, size
,
1874 if (status
!= HSA_STATUS_SUCCESS
)
1875 hsa_fatal ("Could not allocate memory for GCN kernel arena", status
);
1876 status
= hsa_fns
.hsa_memory_assign_agent_fn (item
->address
, agent
->id
,
1877 HSA_ACCESS_PERMISSION_RW
);
1878 if (status
!= HSA_STATUS_SUCCESS
)
1879 hsa_fatal ("Could not assign arena & stack memory to device", status
);
1882 kernargs
->arena_ptr
= (actual_arena_total_size
1883 ? (uint64_t)item
->address
1885 kernargs
->stack_ptr
= (uint64_t)item
->address
+ actual_arena_total_size
;
1886 kernargs
->arena_size_per_team
= actual_arena_size
;
1887 kernargs
->stack_size_per_thread
= stack_size
;
1890 /* Mark an ephemeral memory space available for reuse. */
1893 release_ephemeral_memories (struct agent_info
* agent
, void *address
)
1895 struct ephemeral_memories_list
*item
;
1897 for (item
= agent
->ephemeral_memories_list
; item
; item
= item
->next
)
1899 if (item
->address
== address
)
1901 if (pthread_mutex_unlock (&item
->in_use
))
1902 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1906 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1909 /* Clean up all the allocated team arenas. */
1912 destroy_ephemeral_memories (struct agent_info
*agent
)
1914 struct ephemeral_memories_list
*item
, *next
;
1916 for (item
= agent
->ephemeral_memories_list
; item
; item
= next
)
1919 hsa_fns
.hsa_memory_free_fn (item
->address
);
1920 if (pthread_mutex_destroy (&item
->in_use
))
1922 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1927 agent
->ephemeral_memories_list
= NULL
;
1932 /* Allocate memory on a specified device. */
1935 alloc_by_agent (struct agent_info
*agent
, size_t size
)
1937 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size
, agent
->device_id
);
1940 hsa_status_t status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
1942 if (status
!= HSA_STATUS_SUCCESS
)
1944 hsa_error ("Could not allocate device memory", status
);
1948 status
= hsa_fns
.hsa_memory_assign_agent_fn (ptr
, agent
->id
,
1949 HSA_ACCESS_PERMISSION_RW
);
1950 if (status
!= HSA_STATUS_SUCCESS
)
1952 hsa_error ("Could not assign data memory to device", status
);
1956 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1957 bool profiling_dispatch_p
1958 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1959 if (profiling_dispatch_p
)
1961 acc_prof_info
*prof_info
= thr
->prof_info
;
1962 acc_event_info data_event_info
;
1963 acc_api_info
*api_info
= thr
->api_info
;
1965 prof_info
->event_type
= acc_ev_alloc
;
1967 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1968 data_event_info
.data_event
.valid_bytes
1969 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1970 data_event_info
.data_event
.parent_construct
1971 = acc_construct_parallel
;
1972 data_event_info
.data_event
.implicit
= 1;
1973 data_event_info
.data_event
.tool_info
= NULL
;
1974 data_event_info
.data_event
.var_name
= NULL
;
1975 data_event_info
.data_event
.bytes
= size
;
1976 data_event_info
.data_event
.host_ptr
= NULL
;
1977 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
1979 api_info
->device_api
= acc_device_api_other
;
1981 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1988 /* Create kernel dispatch data structure for given KERNEL, along with
1989 the necessary device signals and memory allocations. */
1991 static struct kernel_dispatch
*
1992 create_kernel_dispatch (struct kernel_info
*kernel
, int num_teams
,
1995 struct agent_info
*agent
= kernel
->agent
;
1996 struct kernel_dispatch
*shadow
1997 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch
));
1999 shadow
->agent
= kernel
->agent
;
2000 shadow
->object
= kernel
->object
;
2002 hsa_signal_t sync_signal
;
2003 hsa_status_t status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &sync_signal
);
2004 if (status
!= HSA_STATUS_SUCCESS
)
2005 hsa_fatal ("Error creating the GCN sync signal", status
);
2007 shadow
->signal
= sync_signal
.handle
;
2008 shadow
->private_segment_size
= kernel
->private_segment_size
;
2010 if (lowlat_size
< 0)
2012 /* Divide the LDS between the number of running teams.
2013 Allocate not less than is defined in the kernel metadata. */
2014 int teams_per_cu
= num_teams
/ get_cu_count (agent
);
2015 int LDS_per_team
= (teams_per_cu
? 65536 / teams_per_cu
: 65536);
2016 shadow
->group_segment_size
2017 = (kernel
->group_segment_size
> LDS_per_team
2018 ? kernel
->group_segment_size
2021 else if (lowlat_size
< GCN_LOWLAT_HEAP
+8)
2022 /* Ensure that there's space for the OpenMP libgomp data. */
2023 shadow
->group_segment_size
= GCN_LOWLAT_HEAP
+8;
2025 shadow
->group_segment_size
= (lowlat_size
> 65536
2029 /* We expect kernels to request a single pointer, explicitly, and the
2030 rest of struct kernargs, implicitly. If they request anything else
2031 then something is wrong. */
2032 if (kernel
->kernarg_segment_size
> 8)
2034 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
2038 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->kernarg_region
,
2039 sizeof (struct kernargs
),
2040 &shadow
->kernarg_address
);
2041 if (status
!= HSA_STATUS_SUCCESS
)
2042 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status
);
2043 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2045 /* Zero-initialize the output_data (minimum needed). */
2046 kernargs
->abi
.out_ptr
= (int64_t)&kernargs
->output_data
;
2047 kernargs
->output_data
.next_output
= 0;
2048 for (unsigned i
= 0;
2049 i
< (sizeof (kernargs
->output_data
.queue
)
2050 / sizeof (kernargs
->output_data
.queue
[0]));
2052 kernargs
->output_data
.queue
[i
].written
= 0;
2053 kernargs
->output_data
.consumed
= 0;
2055 /* Pass in the heap location. */
2056 kernargs
->abi
.heap_ptr
= (int64_t)kernel
->module
->heap
;
2058 /* Create the ephemeral memory spaces. */
2059 configure_ephemeral_memories (kernel
, &kernargs
->abi
, num_teams
, num_threads
);
2061 /* Ensure we can recognize unset return values. */
2062 kernargs
->output_data
.return_value
= 0xcafe0000;
2068 process_reverse_offload (uint64_t fn
, uint64_t mapnum
, uint64_t hostaddrs
,
2069 uint64_t sizes
, uint64_t kinds
, uint64_t dev_num64
)
2071 int dev_num
= dev_num64
;
2072 GOMP_PLUGIN_target_rev (fn
, mapnum
, hostaddrs
, sizes
, kinds
, dev_num
,
2076 /* Output any data written to console output from the kernel. It is expected
2077 that this function is polled during kernel execution.
2079 We print all entries from the last item printed to the next entry without
2080 a "written" flag. If the "final" flag is set then it'll continue right to
2083 The print buffer is circular, but the from and to locations don't wrap when
2084 the buffer does, so the output limit is UINT_MAX. The target blocks on
2085 output when the buffer is full. */
2088 console_output (struct kernel_info
*kernel
, struct kernargs
*kernargs
,
2091 unsigned int limit
= (sizeof (kernargs
->output_data
.queue
)
2092 / sizeof (kernargs
->output_data
.queue
[0]));
2094 unsigned int from
= __atomic_load_n (&kernargs
->output_data
.consumed
,
2096 unsigned int to
= kernargs
->output_data
.next_output
;
2102 printf ("GCN print buffer overflowed.\n");
2107 for (i
= from
; i
< to
; i
++)
2109 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
%limit
];
2111 if (!data
->written
&& !final
)
2116 case 0: printf ("%.128s%ld\n", data
->msg
, data
->ivalue
); break;
2117 case 1: printf ("%.128s%f\n", data
->msg
, data
->dvalue
); break;
2118 case 2: printf ("%.128s%.128s\n", data
->msg
, data
->text
); break;
2119 case 3: printf ("%.128s%.128s", data
->msg
, data
->text
); break;
2121 process_reverse_offload (data
->value_u64
[0], data
->value_u64
[1],
2122 data
->value_u64
[2], data
->value_u64
[3],
2123 data
->value_u64
[4], data
->value_u64
[5]);
2125 default: printf ("GCN print buffer error!\n"); break;
2128 __atomic_store_n (&kernargs
->output_data
.consumed
, i
+1,
2134 /* Release data structure created for a kernel dispatch in SHADOW argument,
2135 and clean up the signal and memory allocations. */
2138 release_kernel_dispatch (struct kernel_dispatch
*shadow
)
2140 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow
);
2142 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2143 void *addr
= (void *)kernargs
->abi
.arena_ptr
;
2145 addr
= (void *)kernargs
->abi
.stack_ptr
;
2146 release_ephemeral_memories (shadow
->agent
, addr
);
2148 hsa_fns
.hsa_memory_free_fn (shadow
->kernarg_address
);
2151 s
.handle
= shadow
->signal
;
2152 hsa_fns
.hsa_signal_destroy_fn (s
);
2157 /* Extract the properties from a kernel binary. */
2160 init_kernel_properties (struct kernel_info
*kernel
)
2162 hsa_status_t status
;
2163 struct agent_info
*agent
= kernel
->agent
;
2164 hsa_executable_symbol_t kernel_symbol
;
2165 char *buf
= alloca (strlen (kernel
->name
) + 4);
2166 sprintf (buf
, "%s.kd", kernel
->name
);
2167 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
2170 if (status
!= HSA_STATUS_SUCCESS
)
2172 hsa_warn ("Could not find symbol for kernel in the code object", status
);
2173 fprintf (stderr
, "not found name: '%s'\n", buf
);
2174 dump_executable_symbols (agent
->executable
);
2177 GCN_DEBUG ("Located kernel %s\n", kernel
->name
);
2178 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2179 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &kernel
->object
);
2180 if (status
!= HSA_STATUS_SUCCESS
)
2181 hsa_fatal ("Could not extract a kernel object from its symbol", status
);
2182 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2183 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
2184 &kernel
->kernarg_segment_size
);
2185 if (status
!= HSA_STATUS_SUCCESS
)
2186 hsa_fatal ("Could not get info about kernel argument size", status
);
2187 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2188 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
2189 &kernel
->group_segment_size
);
2190 if (status
!= HSA_STATUS_SUCCESS
)
2191 hsa_fatal ("Could not get info about kernel group segment size", status
);
2192 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2193 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
2194 &kernel
->private_segment_size
);
2195 if (status
!= HSA_STATUS_SUCCESS
)
2196 hsa_fatal ("Could not get info about kernel private segment size",
2199 /* The kernel type is not known until something tries to launch it. */
2200 kernel
->kind
= KIND_UNKNOWN
;
2202 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2203 "following segment sizes: \n", kernel
->name
);
2204 GCN_DEBUG (" group_segment_size: %u\n",
2205 (unsigned) kernel
->group_segment_size
);
2206 GCN_DEBUG (" private_segment_size: %u\n",
2207 (unsigned) kernel
->private_segment_size
);
2208 GCN_DEBUG (" kernarg_segment_size: %u\n",
2209 (unsigned) kernel
->kernarg_segment_size
);
2213 kernel
->initialization_failed
= true;
2216 /* Do all the work that is necessary before running KERNEL for the first time.
2217 The function assumes the program has been created, finalized and frozen by
2218 create_and_finalize_hsa_program. */
2221 init_kernel (struct kernel_info
*kernel
)
2223 if (pthread_mutex_lock (&kernel
->init_mutex
))
2224 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2225 if (kernel
->initialized
)
2227 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2228 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2234 init_kernel_properties (kernel
);
2236 if (!kernel
->initialization_failed
)
2240 kernel
->initialized
= true;
2242 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2243 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2247 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2248 launch attributes from KLA.
2250 MODULE_LOCKED indicates that the caller already holds the lock and
2251 run_kernel need not lock it again.
2252 If AQ is NULL then agent->sync_queue will be used. */
2255 run_kernel (struct kernel_info
*kernel
, void *vars
,
2256 struct GOMP_kernel_launch_attributes
*kla
,
2257 struct goacc_asyncqueue
*aq
, bool module_locked
)
2259 struct agent_info
*agent
= kernel
->agent
;
2260 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel
->description
->sgpr_count
,
2261 kernel
->description
->vpgr_count
);
2263 /* Reduce the number of threads/workers if there are insufficient
2264 VGPRs available to run the kernels together. */
2265 if (kla
->ndim
== 3 && kernel
->description
->vpgr_count
> 0)
2267 int max_vgprs
= max_isa_vgprs (agent
->device_isa
);
2268 int granulated_vgprs
= (kernel
->description
->vpgr_count
+ 3) & ~3;
2269 int max_threads
= (max_vgprs
/ granulated_vgprs
) * 4;
2270 if (kla
->gdims
[2] > max_threads
)
2272 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2273 " per team/gang - reducing to %d threads/workers.\n",
2274 kla
->gdims
[2], max_threads
);
2275 kla
->gdims
[2] = max_threads
;
2279 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel
->agent
->device_id
,
2281 GCN_DEBUG ("GCN launch attribs: gdims:[");
2283 for (i
= 0; i
< kla
->ndim
; ++i
)
2287 DEBUG_PRINT ("%u", kla
->gdims
[i
]);
2289 DEBUG_PRINT ("], normalized gdims:[");
2290 for (i
= 0; i
< kla
->ndim
; ++i
)
2294 DEBUG_PRINT ("%u", kla
->gdims
[i
] / kla
->wdims
[i
]);
2296 DEBUG_PRINT ("], wdims:[");
2297 for (i
= 0; i
< kla
->ndim
; ++i
)
2301 DEBUG_PRINT ("%u", kla
->wdims
[i
]);
2303 DEBUG_PRINT ("]\n");
2306 if (!module_locked
&& pthread_rwlock_rdlock (&agent
->module_rwlock
))
2307 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2309 if (!agent
->initialized
)
2310 GOMP_PLUGIN_fatal ("Agent must be initialized");
2312 if (!kernel
->initialized
)
2313 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2315 hsa_queue_t
*command_q
= (aq
? aq
->hsa_queue
: kernel
->agent
->sync_queue
);
2318 = hsa_fns
.hsa_queue_add_write_index_release_fn (command_q
, 1);
2319 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index
);
2321 /* Wait until the queue is not full before writing the packet. */
2322 while (index
- hsa_fns
.hsa_queue_load_read_index_acquire_fn (command_q
)
2326 /* Do not allow the dimensions to be overridden when running
2327 constructors or destructors. */
2328 int override_x
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_x_dim
;
2329 int override_z
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_z_dim
;
2331 hsa_kernel_dispatch_packet_t
*packet
;
2332 packet
= ((hsa_kernel_dispatch_packet_t
*) command_q
->base_address
)
2333 + index
% command_q
->size
;
2335 memset (((uint8_t *) packet
) + 4, 0, sizeof (*packet
) - 4);
2336 packet
->grid_size_x
= override_x
? : kla
->gdims
[0];
2337 packet
->workgroup_size_x
= get_group_size (kla
->ndim
,
2338 packet
->grid_size_x
,
2343 packet
->grid_size_y
= kla
->gdims
[1];
2344 packet
->workgroup_size_y
= get_group_size (kla
->ndim
, kla
->gdims
[1],
2349 packet
->grid_size_y
= 1;
2350 packet
->workgroup_size_y
= 1;
2355 packet
->grid_size_z
= limit_worker_threads (override_z
2357 packet
->workgroup_size_z
= get_group_size (kla
->ndim
,
2358 packet
->grid_size_z
,
2363 packet
->grid_size_z
= 1;
2364 packet
->workgroup_size_z
= 1;
2367 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2368 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2369 packet
->grid_size_x
, packet
->grid_size_y
, packet
->grid_size_z
,
2370 packet
->grid_size_x
/ packet
->workgroup_size_x
,
2371 packet
->grid_size_y
/ packet
->workgroup_size_y
,
2372 packet
->grid_size_z
/ packet
->workgroup_size_z
,
2373 packet
->workgroup_size_x
, packet
->workgroup_size_y
,
2374 packet
->workgroup_size_z
);
2376 struct kernel_dispatch
*shadow
2377 = create_kernel_dispatch (kernel
, packet
->grid_size_x
,
2378 packet
->grid_size_z
);
2379 shadow
->queue
= command_q
;
2383 fprintf (stderr
, "\nKernel has following dependencies:\n");
2384 print_kernel_dispatch (shadow
, 2);
2387 packet
->private_segment_size
= shadow
->private_segment_size
;
2388 packet
->group_segment_size
= shadow
->group_segment_size
;
2389 packet
->kernel_object
= shadow
->object
;
2390 packet
->kernarg_address
= shadow
->kernarg_address
;
2392 s
.handle
= shadow
->signal
;
2393 packet
->completion_signal
= s
;
2394 hsa_fns
.hsa_signal_store_relaxed_fn (s
, 1);
2395 memcpy (shadow
->kernarg_address
, &vars
, sizeof (vars
));
2397 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2400 header
= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
2401 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
2402 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
2404 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel
->name
,
2407 packet_store_release ((uint32_t *) packet
, header
,
2408 (uint16_t) kla
->ndim
2409 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
);
2411 hsa_fns
.hsa_signal_store_release_fn (command_q
->doorbell_signal
,
2414 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2416 /* Root signal waits with 1ms timeout. */
2417 while (hsa_fns
.hsa_signal_wait_acquire_fn (s
, HSA_SIGNAL_CONDITION_LT
, 1,
2419 HSA_WAIT_STATE_BLOCKED
) != 0)
2421 console_output (kernel
, shadow
->kernarg_address
, false);
2423 console_output (kernel
, shadow
->kernarg_address
, true);
2425 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2426 unsigned int return_value
= (unsigned int)kernargs
->output_data
.return_value
;
2428 release_kernel_dispatch (shadow
);
2430 if (!module_locked
&& pthread_rwlock_unlock (&agent
->module_rwlock
))
2431 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2433 unsigned int upper
= (return_value
& ~0xffff) >> 16;
2434 if (upper
== 0xcafe)
2435 ; // exit not called, normal termination.
2436 else if (upper
== 0xffff)
2440 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2441 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2446 if (upper
== 0xffff)
2448 unsigned int signal
= (return_value
>> 8) & 0xff;
2450 if (signal
== SIGABRT
)
2452 GCN_WARNING ("GCN Kernel aborted\n");
2455 else if (signal
!= 0)
2457 GCN_WARNING ("GCN Kernel received unknown signal\n");
2461 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value
& 0xff);
2462 exit (return_value
& 0xff);
2467 /* {{{ Load/Unload */
2469 /* Initialize KERNEL from D and other parameters. Return true on success. */
2472 init_basic_kernel_info (struct kernel_info
*kernel
,
2473 struct hsa_kernel_description
*d
,
2474 struct agent_info
*agent
,
2475 struct module_info
*module
)
2477 kernel
->agent
= agent
;
2478 kernel
->module
= module
;
2479 kernel
->name
= d
->name
;
2480 kernel
->description
= d
;
2481 if (pthread_mutex_init (&kernel
->init_mutex
, NULL
))
2483 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2489 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2492 isa_matches_agent (struct agent_info
*agent
, Elf64_Ehdr
*image
)
2494 int isa_field
= elf_gcn_isa_field (image
);
2495 const char* isa_s
= isa_hsa_name (isa_field
);
2498 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR
);
2502 if (isa_field
!= agent
->device_isa
)
2505 const char *agent_isa_s
= isa_hsa_name (agent
->device_isa
);
2506 const char *agent_isa_gcc_s
= isa_gcc_name (agent
->device_isa
);
2507 assert (agent_isa_s
);
2508 assert (agent_isa_gcc_s
);
2510 snprintf (msg
, sizeof msg
,
2511 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2512 "Try to recompile with '-foffload-options=-march=%s'.\n",
2513 isa_s
, agent_isa_s
, agent_isa_gcc_s
);
2515 hsa_error (msg
, HSA_STATUS_ERROR
);
2522 /* Create and finalize the program consisting of all loaded modules. */
2525 create_and_finalize_hsa_program (struct agent_info
*agent
)
2527 hsa_status_t status
;
2529 if (pthread_mutex_lock (&agent
->prog_mutex
))
2531 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2534 if (agent
->prog_finalized
)
2538 = hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
2539 HSA_EXECUTABLE_STATE_UNFROZEN
,
2540 "", &agent
->executable
);
2541 if (status
!= HSA_STATUS_SUCCESS
)
2543 hsa_error ("Could not create GCN executable", status
);
2547 /* Load any GCN modules. */
2548 struct module_info
*module
= agent
->module
;
2551 Elf64_Ehdr
*image
= (Elf64_Ehdr
*)module
->image_desc
->gcn_image
->image
;
2553 if (!isa_matches_agent (agent
, image
))
2556 hsa_code_object_t co
= { 0 };
2557 status
= hsa_fns
.hsa_code_object_deserialize_fn
2558 (module
->image_desc
->gcn_image
->image
,
2559 module
->image_desc
->gcn_image
->size
,
2561 if (status
!= HSA_STATUS_SUCCESS
)
2563 hsa_error ("Could not deserialize GCN code object", status
);
2567 status
= hsa_fns
.hsa_executable_load_code_object_fn
2568 (agent
->executable
, agent
->id
, co
, "");
2569 if (status
!= HSA_STATUS_SUCCESS
)
2571 hsa_error ("Could not load GCN code object", status
);
2577 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
2578 gcn_kernel_heap_size
,
2579 (void**)&module
->heap
);
2580 if (status
!= HSA_STATUS_SUCCESS
)
2582 hsa_error ("Could not allocate memory for GCN heap", status
);
2586 status
= hsa_fns
.hsa_memory_assign_agent_fn
2587 (module
->heap
, agent
->id
, HSA_ACCESS_PERMISSION_RW
);
2588 if (status
!= HSA_STATUS_SUCCESS
)
2590 hsa_error ("Could not assign GCN heap memory to device", status
);
2594 hsa_fns
.hsa_memory_copy_fn (&module
->heap
->size
,
2595 &gcn_kernel_heap_size
,
2596 sizeof (gcn_kernel_heap_size
));
2602 dump_executable_symbols (agent
->executable
);
2604 status
= hsa_fns
.hsa_executable_freeze_fn (agent
->executable
, "");
2605 if (status
!= HSA_STATUS_SUCCESS
)
2607 hsa_error ("Could not freeze the GCN executable", status
);
2612 agent
->prog_finalized
= true;
2614 if (pthread_mutex_unlock (&agent
->prog_mutex
))
2616 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2627 /* Free the HSA program in agent and everything associated with it and set
2628 agent->prog_finalized and the initialized flags of all kernels to false.
2629 Return TRUE on success. */
2632 destroy_hsa_program (struct agent_info
*agent
)
2634 if (!agent
->prog_finalized
)
2637 hsa_status_t status
;
2639 GCN_DEBUG ("Destroying the current GCN program.\n");
2641 status
= hsa_fns
.hsa_executable_destroy_fn (agent
->executable
);
2642 if (status
!= HSA_STATUS_SUCCESS
)
2643 return hsa_error ("Could not destroy GCN executable", status
);
2648 for (i
= 0; i
< agent
->module
->kernel_count
; i
++)
2649 agent
->module
->kernels
[i
].initialized
= false;
2651 if (agent
->module
->heap
)
2653 hsa_fns
.hsa_memory_free_fn (agent
->module
->heap
);
2654 agent
->module
->heap
= NULL
;
2657 agent
->prog_finalized
= false;
2661 /* Deinitialize all information associated with MODULE and kernels within
2662 it. Return TRUE on success. */
2665 destroy_module (struct module_info
*module
, bool locked
)
2667 /* Run destructors before destroying module. */
2668 struct GOMP_kernel_launch_attributes kla
=
2672 /* Work-group size. */
2676 if (module
->fini_array_func
)
2678 init_kernel (module
->fini_array_func
);
2679 run_kernel (module
->fini_array_func
, NULL
, &kla
, NULL
, locked
);
2681 module
->constructors_run_p
= false;
2684 for (i
= 0; i
< module
->kernel_count
; i
++)
2685 if (pthread_mutex_destroy (&module
->kernels
[i
].init_mutex
))
2687 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2698 /* Callback of dispatch queues to report errors. */
2701 execute_queue_entry (struct goacc_asyncqueue
*aq
, int index
)
2703 struct queue_entry
*entry
= &aq
->queue
[index
];
2705 switch (entry
->type
)
2709 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2710 aq
->agent
->device_id
, aq
->id
, index
);
2711 run_kernel (entry
->u
.launch
.kernel
,
2712 entry
->u
.launch
.vars
,
2713 &entry
->u
.launch
.kla
, aq
, false);
2715 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2716 aq
->agent
->device_id
, aq
->id
, index
);
2721 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2722 aq
->agent
->device_id
, aq
->id
, index
);
2723 entry
->u
.callback
.fn (entry
->u
.callback
.data
);
2725 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2726 aq
->agent
->device_id
, aq
->id
, index
);
2731 /* FIXME: is it safe to access a placeholder that may already have
2733 struct placeholder
*placeholderp
= entry
->u
.asyncwait
.placeholderp
;
2736 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2737 aq
->agent
->device_id
, aq
->id
, index
);
2739 pthread_mutex_lock (&placeholderp
->mutex
);
2741 while (!placeholderp
->executed
)
2742 pthread_cond_wait (&placeholderp
->cond
, &placeholderp
->mutex
);
2744 pthread_mutex_unlock (&placeholderp
->mutex
);
2746 if (pthread_cond_destroy (&placeholderp
->cond
))
2747 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2749 if (pthread_mutex_destroy (&placeholderp
->mutex
))
2750 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2753 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2754 "entry (%d) done\n", aq
->agent
->device_id
, aq
->id
, index
);
2758 case ASYNC_PLACEHOLDER
:
2759 pthread_mutex_lock (&entry
->u
.placeholder
.mutex
);
2760 entry
->u
.placeholder
.executed
= 1;
2761 pthread_cond_signal (&entry
->u
.placeholder
.cond
);
2762 pthread_mutex_unlock (&entry
->u
.placeholder
.mutex
);
2766 GOMP_PLUGIN_fatal ("Unknown queue element");
2770 /* This function is run as a thread to service an async queue in the
2771 background. It runs continuously until the stop flag is set. */
2774 drain_queue (void *thread_arg
)
2776 struct goacc_asyncqueue
*aq
= thread_arg
;
2778 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
2780 aq
->drain_queue_stop
= 2;
2784 pthread_mutex_lock (&aq
->mutex
);
2788 if (aq
->drain_queue_stop
)
2791 if (aq
->queue_n
> 0)
2793 pthread_mutex_unlock (&aq
->mutex
);
2794 execute_queue_entry (aq
, aq
->queue_first
);
2796 pthread_mutex_lock (&aq
->mutex
);
2797 aq
->queue_first
= ((aq
->queue_first
+ 1)
2798 % ASYNC_QUEUE_SIZE
);
2801 if (DEBUG_THREAD_SIGNAL
)
2802 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2803 aq
->agent
->device_id
, aq
->id
);
2804 pthread_cond_broadcast (&aq
->queue_cond_out
);
2805 pthread_mutex_unlock (&aq
->mutex
);
2808 GCN_DEBUG ("Async thread %d:%d: continue\n", aq
->agent
->device_id
,
2810 pthread_mutex_lock (&aq
->mutex
);
2814 if (DEBUG_THREAD_SLEEP
)
2815 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2816 aq
->agent
->device_id
, aq
->id
);
2817 pthread_cond_wait (&aq
->queue_cond_in
, &aq
->mutex
);
2818 if (DEBUG_THREAD_SLEEP
)
2819 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2820 aq
->agent
->device_id
, aq
->id
);
2824 aq
->drain_queue_stop
= 2;
2825 if (DEBUG_THREAD_SIGNAL
)
2826 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2827 aq
->agent
->device_id
, aq
->id
);
2828 pthread_cond_broadcast (&aq
->queue_cond_out
);
2829 pthread_mutex_unlock (&aq
->mutex
);
2831 GCN_DEBUG ("Async thread %d:%d: returning\n", aq
->agent
->device_id
, aq
->id
);
2835 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2836 is not usually the case. This is just a debug tool. */
2839 drain_queue_synchronous (struct goacc_asyncqueue
*aq
)
2841 pthread_mutex_lock (&aq
->mutex
);
2843 while (aq
->queue_n
> 0)
2845 execute_queue_entry (aq
, aq
->queue_first
);
2847 aq
->queue_first
= ((aq
->queue_first
+ 1)
2848 % ASYNC_QUEUE_SIZE
);
2852 pthread_mutex_unlock (&aq
->mutex
);
2855 /* Block the current thread until an async queue is writable. The aq->mutex
2856 lock should be held on entry, and remains locked on exit. */
2859 wait_for_queue_nonfull (struct goacc_asyncqueue
*aq
)
2861 if (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2863 /* Queue is full. Wait for it to not be full. */
2864 while (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2865 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2869 /* Request an asynchronous kernel launch on the specified queue. This
2870 may block if the queue is full, but returns without waiting for the
2874 queue_push_launch (struct goacc_asyncqueue
*aq
, struct kernel_info
*kernel
,
2875 void *vars
, struct GOMP_kernel_launch_attributes
*kla
)
2877 assert (aq
->agent
== kernel
->agent
);
2879 pthread_mutex_lock (&aq
->mutex
);
2881 wait_for_queue_nonfull (aq
);
2883 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2884 % ASYNC_QUEUE_SIZE
);
2886 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq
->agent
->device_id
,
2887 aq
->id
, queue_last
);
2889 aq
->queue
[queue_last
].type
= KERNEL_LAUNCH
;
2890 aq
->queue
[queue_last
].u
.launch
.kernel
= kernel
;
2891 aq
->queue
[queue_last
].u
.launch
.vars
= vars
;
2892 aq
->queue
[queue_last
].u
.launch
.kla
= *kla
;
2896 if (DEBUG_THREAD_SIGNAL
)
2897 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2898 aq
->agent
->device_id
, aq
->id
);
2899 pthread_cond_signal (&aq
->queue_cond_in
);
2901 pthread_mutex_unlock (&aq
->mutex
);
2904 /* Request an asynchronous callback on the specified queue. The callback
2905 function will be called, with the given opaque data, from the appropriate
2906 async thread, when all previous items on that queue are complete. */
2909 queue_push_callback (struct goacc_asyncqueue
*aq
, void (*fn
)(void *),
2912 pthread_mutex_lock (&aq
->mutex
);
2914 wait_for_queue_nonfull (aq
);
2916 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2917 % ASYNC_QUEUE_SIZE
);
2919 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq
->agent
->device_id
,
2920 aq
->id
, queue_last
);
2922 aq
->queue
[queue_last
].type
= CALLBACK
;
2923 aq
->queue
[queue_last
].u
.callback
.fn
= fn
;
2924 aq
->queue
[queue_last
].u
.callback
.data
= data
;
2928 if (DEBUG_THREAD_SIGNAL
)
2929 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2930 aq
->agent
->device_id
, aq
->id
);
2931 pthread_cond_signal (&aq
->queue_cond_in
);
2933 pthread_mutex_unlock (&aq
->mutex
);
2936 /* Request that a given async thread wait for another thread (unspecified) to
2937 reach the given placeholder. The wait will occur when all previous entries
2938 on the queue are complete. A placeholder is effectively a kind of signal
2939 which simply sets a flag when encountered in a queue. */
2942 queue_push_asyncwait (struct goacc_asyncqueue
*aq
,
2943 struct placeholder
*placeholderp
)
2945 pthread_mutex_lock (&aq
->mutex
);
2947 wait_for_queue_nonfull (aq
);
2949 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2951 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq
->agent
->device_id
,
2952 aq
->id
, queue_last
);
2954 aq
->queue
[queue_last
].type
= ASYNC_WAIT
;
2955 aq
->queue
[queue_last
].u
.asyncwait
.placeholderp
= placeholderp
;
2959 if (DEBUG_THREAD_SIGNAL
)
2960 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2961 aq
->agent
->device_id
, aq
->id
);
2962 pthread_cond_signal (&aq
->queue_cond_in
);
2964 pthread_mutex_unlock (&aq
->mutex
);
2967 /* Add a placeholder into an async queue. When the async thread reaches the
2968 placeholder it will set the "executed" flag to true and continue.
2969 Another thread may be waiting on this thread reaching the placeholder. */
2971 static struct placeholder
*
2972 queue_push_placeholder (struct goacc_asyncqueue
*aq
)
2974 struct placeholder
*placeholderp
;
2976 pthread_mutex_lock (&aq
->mutex
);
2978 wait_for_queue_nonfull (aq
);
2980 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2982 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq
->agent
->device_id
,
2983 aq
->id
, queue_last
);
2985 aq
->queue
[queue_last
].type
= ASYNC_PLACEHOLDER
;
2986 placeholderp
= &aq
->queue
[queue_last
].u
.placeholder
;
2988 if (pthread_mutex_init (&placeholderp
->mutex
, NULL
))
2990 pthread_mutex_unlock (&aq
->mutex
);
2991 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2994 if (pthread_cond_init (&placeholderp
->cond
, NULL
))
2996 pthread_mutex_unlock (&aq
->mutex
);
2997 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
3000 placeholderp
->executed
= 0;
3004 if (DEBUG_THREAD_SIGNAL
)
3005 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
3006 aq
->agent
->device_id
, aq
->id
);
3007 pthread_cond_signal (&aq
->queue_cond_in
);
3009 pthread_mutex_unlock (&aq
->mutex
);
3011 return placeholderp
;
3014 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
3017 finalize_async_thread (struct goacc_asyncqueue
*aq
)
3019 pthread_mutex_lock (&aq
->mutex
);
3020 if (aq
->drain_queue_stop
== 2)
3022 pthread_mutex_unlock (&aq
->mutex
);
3026 aq
->drain_queue_stop
= 1;
3028 if (DEBUG_THREAD_SIGNAL
)
3029 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3030 aq
->agent
->device_id
, aq
->id
);
3031 pthread_cond_signal (&aq
->queue_cond_in
);
3033 while (aq
->drain_queue_stop
!= 2)
3035 if (DEBUG_THREAD_SLEEP
)
3036 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3037 " to sleep\n", aq
->agent
->device_id
, aq
->id
);
3038 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
3039 if (DEBUG_THREAD_SLEEP
)
3040 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
3041 aq
->agent
->device_id
, aq
->id
);
3044 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq
->agent
->device_id
,
3046 pthread_mutex_unlock (&aq
->mutex
);
3048 int err
= pthread_join (aq
->thread_drain_queue
, NULL
);
3050 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3051 aq
->agent
->device_id
, aq
->id
, strerror (err
));
3052 GCN_DEBUG ("Joined with async thread %d:%d\n", aq
->agent
->device_id
, aq
->id
);
3055 /* Set up an async queue for OpenMP. There will be only one. The
3056 implementation simply uses an OpenACC async queue.
3057 FIXME: is this thread-safe if two threads call this function? */
3060 maybe_init_omp_async (struct agent_info
*agent
)
3062 if (!agent
->omp_async_queue
)
3063 agent
->omp_async_queue
3064 = GOMP_OFFLOAD_openacc_async_construct (agent
->device_id
);
3067 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3068 copies from read-only pages. */
3071 hsa_memory_copy_wrapper (void *dst
, const void *src
, size_t len
)
3073 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, len
);
3075 if (status
== HSA_STATUS_SUCCESS
)
3078 /* It appears that the copy fails if the source data is in a read-only page.
3079 We can't detect that easily, so try copying the data to a temporary buffer
3080 and doing the copy again if we got an error above. */
3082 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3083 "[%p:+%d]\n", (void *) src
, (int) len
);
3085 void *src_copy
= malloc (len
);
3086 memcpy (src_copy
, src
, len
);
3087 status
= hsa_fns
.hsa_memory_copy_fn (dst
, (const void *) src_copy
, len
);
3089 if (status
!= HSA_STATUS_SUCCESS
)
3090 GOMP_PLUGIN_error ("memory copy failed");
3093 /* Copy data to or from a device. This is intended for use as an async
3097 copy_data (void *data_
)
3099 struct copy_data
*data
= (struct copy_data
*)data_
;
3100 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3101 data
->aq
->agent
->device_id
, data
->aq
->id
, data
->len
, data
->src
,
3103 hsa_memory_copy_wrapper (data
->dst
, data
->src
, data
->len
);
3107 /* Request an asynchronous data copy, to or from a device, on a given queue.
3108 The event will be registered as a callback. */
3111 queue_push_copy (struct goacc_asyncqueue
*aq
, void *dst
, const void *src
,
3115 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3116 aq
->agent
->device_id
, aq
->id
, len
, src
, dst
);
3117 struct copy_data
*data
3118 = (struct copy_data
*)GOMP_PLUGIN_malloc (sizeof (struct copy_data
));
3123 queue_push_callback (aq
, copy_data
, data
);
3126 /* Return true if the given queue is currently empty. */
3129 queue_empty (struct goacc_asyncqueue
*aq
)
3131 pthread_mutex_lock (&aq
->mutex
);
3132 int res
= aq
->queue_n
== 0 ? 1 : 0;
3133 pthread_mutex_unlock (&aq
->mutex
);
3138 /* Wait for a given queue to become empty. This implements an OpenACC wait
3142 wait_queue (struct goacc_asyncqueue
*aq
)
3144 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
3146 drain_queue_synchronous (aq
);
3150 pthread_mutex_lock (&aq
->mutex
);
3152 while (aq
->queue_n
> 0)
3154 if (DEBUG_THREAD_SLEEP
)
3155 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3156 aq
->agent
->device_id
, aq
->id
);
3157 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
3158 if (DEBUG_THREAD_SLEEP
)
3159 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq
->agent
->device_id
,
3163 pthread_mutex_unlock (&aq
->mutex
);
3164 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq
->agent
->device_id
, aq
->id
);
3168 /* {{{ OpenACC support */
3170 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3173 gcn_exec (struct kernel_info
*kernel
,
3174 void **devaddrs
, unsigned *dims
, void *targ_mem_desc
, bool async
,
3175 struct goacc_asyncqueue
*aq
)
3177 if (!GOMP_OFFLOAD_can_run (kernel
))
3178 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3180 /* If we get here then this must be an OpenACC kernel. */
3181 kernel
->kind
= KIND_OPENACC
;
3183 struct hsa_kernel_description
*hsa_kernel_desc
= NULL
;
3184 for (unsigned i
= 0; i
< kernel
->module
->image_desc
->kernel_count
; i
++)
3186 struct hsa_kernel_description
*d
3187 = &kernel
->module
->image_desc
->kernel_infos
[i
];
3188 if (d
->name
== kernel
->name
)
3190 hsa_kernel_desc
= d
;
3195 /* We may have statically-determined dimensions in
3196 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3197 invocation at runtime in dims[]. We allow static dimensions to take
3198 priority over dynamic dimensions when present (non-zero). */
3199 if (hsa_kernel_desc
->oacc_dims
[0] > 0)
3200 dims
[0] = hsa_kernel_desc
->oacc_dims
[0];
3201 if (hsa_kernel_desc
->oacc_dims
[1] > 0)
3202 dims
[1] = hsa_kernel_desc
->oacc_dims
[1];
3203 if (hsa_kernel_desc
->oacc_dims
[2] > 0)
3204 dims
[2] = hsa_kernel_desc
->oacc_dims
[2];
3206 /* Ideally, when a dimension isn't explicitly specified, we should
3207 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3208 In practice, we tune for peak performance on BabelStream, which
3209 for OpenACC is currently 32 threads per CU. */
3210 if (dims
[0] == 0 && dims
[1] == 0)
3212 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3213 number. There isn't really a correct answer for this without a clue
3214 about the problem size, so let's do a reasonable number of workers
3217 dims
[0] = get_cu_count (kernel
->agent
) * 4; /* Gangs. */
3218 dims
[1] = 8; /* Workers. */
3220 else if (dims
[0] == 0 && dims
[1] > 0)
3222 /* Auto-scale the number of gangs with the requested number of workers. */
3223 dims
[0] = get_cu_count (kernel
->agent
) * (32 / dims
[1]);
3225 else if (dims
[0] > 0 && dims
[1] == 0)
3227 /* Auto-scale the number of workers with the requested number of gangs. */
3228 dims
[1] = get_cu_count (kernel
->agent
) * 32 / dims
[0];
3235 /* The incoming dimensions are expressed in terms of gangs, workers, and
3236 vectors. The HSA dimensions are expressed in terms of "work-items",
3237 which means multiples of vector lanes.
3239 The "grid size" specifies the size of the problem space, and the
3240 "work-group size" specifies how much of that we want a single compute
3241 unit to chew on at once.
3243 The three dimensions do not really correspond to hardware, but the
3244 important thing is that the HSA runtime will launch as many
3245 work-groups as it takes to process the entire grid, and each
3246 work-group will contain as many wave-fronts as it takes to process
3247 the work-items in that group.
3249 Essentially, as long as we set the Y dimension to 64 (the number of
3250 vector lanes in hardware), and the Z group size to the maximum (16),
3251 then we will get the gangs (X) and workers (Z) launched as we expect.
3253 The reason for the apparent reversal of vector and worker dimension
3254 order is to do with the way the run-time distributes work-items across
3256 struct GOMP_kernel_launch_attributes kla
=
3259 {dims
[0], 64, dims
[1]},
3260 /* Work-group size. */
3264 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3265 acc_prof_info
*prof_info
= thr
->prof_info
;
3266 acc_event_info enqueue_launch_event_info
;
3267 acc_api_info
*api_info
= thr
->api_info
;
3268 bool profiling_dispatch_p
= __builtin_expect (prof_info
!= NULL
, false);
3269 if (profiling_dispatch_p
)
3271 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
3273 enqueue_launch_event_info
.launch_event
.event_type
3274 = prof_info
->event_type
;
3275 enqueue_launch_event_info
.launch_event
.valid_bytes
3276 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
3277 enqueue_launch_event_info
.launch_event
.parent_construct
3278 = acc_construct_parallel
;
3279 enqueue_launch_event_info
.launch_event
.implicit
= 1;
3280 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
3281 enqueue_launch_event_info
.launch_event
.kernel_name
3282 = (char *) kernel
->name
;
3283 enqueue_launch_event_info
.launch_event
.num_gangs
= kla
.gdims
[0];
3284 enqueue_launch_event_info
.launch_event
.num_workers
= kla
.gdims
[2];
3285 enqueue_launch_event_info
.launch_event
.vector_length
= kla
.gdims
[1];
3287 api_info
->device_api
= acc_device_api_other
;
3289 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3290 &enqueue_launch_event_info
, api_info
);
3294 run_kernel (kernel
, devaddrs
, &kla
, NULL
, false);
3296 queue_push_launch (aq
, kernel
, devaddrs
, &kla
);
3298 if (profiling_dispatch_p
)
3300 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
3301 enqueue_launch_event_info
.launch_event
.event_type
= prof_info
->event_type
;
3302 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3303 &enqueue_launch_event_info
,
3309 /* {{{ Generic Plugin API */
/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "gcn";
}
3319 /* Return the specific capabilities the HSA accelerator have. */
3322 GOMP_OFFLOAD_get_caps (void)
3324 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3325 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3326 | GOMP_OFFLOAD_CAP_OPENACC_200
;
3329 /* Identify as GCN accelerator. */
3332 GOMP_OFFLOAD_get_type (void)
3334 return OFFLOAD_TARGET_TYPE_GCN
;
3337 /* Return the libgomp version number we're compatible with. There is
3338 no requirement for cross-version compatibility. */
3341 GOMP_OFFLOAD_version (void)
3343 return GOMP_VERSION
;
3346 /* Return the number of GCN devices on the system. */
3349 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask
)
3351 if (!init_hsa_context (true))
3352 exit (EXIT_FAILURE
);
3353 /* Return -1 if no omp_requires_mask cannot be fulfilled but
3354 devices were present. */
3355 if (hsa_context
.agent_count
> 0
3356 && ((omp_requires_mask
3357 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3358 | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
3359 | GOMP_REQUIRES_REVERSE_OFFLOAD
)) != 0))
3361 /* Check whether host page access is supported; this is per system level
3362 (all GPUs supported by HSA). While intrinsically true for APUs, it
3363 requires XNACK support for discrete GPUs. */
3364 if (hsa_context
.agent_count
> 0
3365 && (omp_requires_mask
& GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
))
3368 hsa_system_info_t type
= HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT
;
3369 hsa_status_t status
= hsa_fns
.hsa_system_get_info_fn (type
, &b
);
3370 if (status
!= HSA_STATUS_SUCCESS
)
3371 GOMP_PLUGIN_error ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT "
3377 return hsa_context
.agent_count
;
3380 /* Initialize device (agent) number N so that it can be used for computation.
3381 Return TRUE on success. */
3384 GOMP_OFFLOAD_init_device (int n
)
3386 if (!init_hsa_context (false))
3388 if (n
>= hsa_context
.agent_count
)
3390 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n
);
3393 struct agent_info
*agent
= &hsa_context
.agents
[n
];
3395 if (agent
->initialized
)
3398 agent
->device_id
= n
;
3400 if (pthread_rwlock_init (&agent
->module_rwlock
, NULL
))
3402 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3405 if (pthread_mutex_init (&agent
->prog_mutex
, NULL
))
3407 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3410 if (pthread_mutex_init (&agent
->async_queues_mutex
, NULL
))
3412 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3415 if (pthread_mutex_init (&agent
->ephemeral_memories_write_lock
, NULL
))
3417 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3420 agent
->async_queues
= NULL
;
3421 agent
->omp_async_queue
= NULL
;
3422 agent
->ephemeral_memories_list
= NULL
;
3424 uint32_t queue_size
;
3425 hsa_status_t status
;
3426 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
,
3427 HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
3429 if (status
!= HSA_STATUS_SUCCESS
)
3430 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3433 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_NAME
,
3435 if (status
!= HSA_STATUS_SUCCESS
)
3436 return hsa_error ("Error querying the name of the agent", status
);
3438 agent
->device_isa
= isa_code (agent
->name
);
3439 if (agent
->device_isa
== EF_AMDGPU_MACH_UNSUPPORTED
)
3440 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR
);
3442 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_VENDOR_NAME
,
3443 &agent
->vendor_name
);
3444 if (status
!= HSA_STATUS_SUCCESS
)
3445 return hsa_error ("Error querying the vendor name of the agent", status
);
3447 status
= hsa_fns
.hsa_queue_create_fn (agent
->id
, queue_size
,
3448 HSA_QUEUE_TYPE_MULTI
,
3449 hsa_queue_callback
, NULL
, UINT32_MAX
,
3450 UINT32_MAX
, &agent
->sync_queue
);
3451 if (status
!= HSA_STATUS_SUCCESS
)
3452 return hsa_error ("Error creating command queue", status
);
3454 agent
->kernarg_region
.handle
= (uint64_t) -1;
3455 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3456 get_kernarg_memory_region
,
3457 &agent
->kernarg_region
);
3458 if (status
!= HSA_STATUS_SUCCESS
3459 && status
!= HSA_STATUS_INFO_BREAK
)
3460 hsa_error ("Scanning memory regions failed", status
);
3461 if (agent
->kernarg_region
.handle
== (uint64_t) -1)
3463 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3467 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3468 dump_hsa_region (agent
->kernarg_region
, NULL
);
3470 agent
->data_region
.handle
= (uint64_t) -1;
3471 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3472 get_data_memory_region
,
3473 &agent
->data_region
);
3474 if (status
!= HSA_STATUS_SUCCESS
3475 && status
!= HSA_STATUS_INFO_BREAK
)
3476 hsa_error ("Scanning memory regions failed", status
);
3477 if (agent
->data_region
.handle
== (uint64_t) -1)
3479 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3483 GCN_DEBUG ("Selected device data memory region:\n");
3484 dump_hsa_region (agent
->data_region
, NULL
);
3486 GCN_DEBUG ("GCN agent %d initialized\n", n
);
3488 agent
->initialized
= true;
3492 /* Load GCN object-code module described by struct gcn_image_desc in
3493 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3494 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3495 contain the on-device addresses of the functions for reverse offload. To be
3496 freed by the caller. */
3499 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
3500 struct addr_pair
**target_table
,
3501 uint64_t **rev_fn_table
,
3502 uint64_t *host_ind_fn_table
)
3504 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3506 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3507 " (expected %u, received %u)",
3508 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3512 struct gcn_image_desc
*image_desc
= (struct gcn_image_desc
*) target_data
;
3513 struct agent_info
*agent
;
3514 struct addr_pair
*pair
;
3515 struct module_info
*module
;
3516 struct kernel_info
*kernel
;
3517 int kernel_count
= image_desc
->kernel_count
;
3518 unsigned ind_func_count
= GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version
)
3519 ? image_desc
->ind_func_count
: 0;
3520 unsigned var_count
= image_desc
->global_variable_count
;
3521 /* Currently, "others" is a struct of ICVS. */
3522 int other_count
= 1;
3524 agent
= get_agent_info (ord
);
3528 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3530 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3533 if (agent
->prog_finalized
3534 && !destroy_hsa_program (agent
))
3537 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count
);
3538 GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count
);
3539 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count
);
3540 GCN_DEBUG ("Expect %d other variables in an image\n", other_count
);
3541 pair
= GOMP_PLUGIN_malloc ((kernel_count
+ var_count
+ other_count
- 2)
3542 * sizeof (struct addr_pair
));
3543 *target_table
= pair
;
3544 module
= (struct module_info
*)
3545 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info
)
3546 + kernel_count
* sizeof (struct kernel_info
));
3547 module
->image_desc
= image_desc
;
3548 module
->kernel_count
= kernel_count
;
3549 module
->heap
= NULL
;
3550 module
->constructors_run_p
= false;
3552 kernel
= &module
->kernels
[0];
3554 /* Allocate memory for kernel dependencies. */
3555 for (unsigned i
= 0; i
< kernel_count
; i
++)
3557 struct hsa_kernel_description
*d
= &image_desc
->kernel_infos
[i
];
3558 if (!init_basic_kernel_info (kernel
, d
, agent
, module
))
3560 if (strcmp (d
->name
, "_init_array") == 0)
3561 module
->init_array_func
= kernel
;
3562 else if (strcmp (d
->name
, "_fini_array") == 0)
3563 module
->fini_array_func
= kernel
;
3566 pair
->start
= (uintptr_t) kernel
;
3567 pair
->end
= (uintptr_t) (kernel
+ 1);
3573 agent
->module
= module
;
3574 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3576 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3580 if (!create_and_finalize_hsa_program (agent
))
3585 hsa_status_t status
;
3586 hsa_executable_symbol_t var_symbol
;
3587 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3588 ".offload_var_table",
3592 if (status
!= HSA_STATUS_SUCCESS
)
3593 hsa_fatal ("Could not find symbol for variable in the code object",
3596 uint64_t var_table_addr
;
3597 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3598 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3600 if (status
!= HSA_STATUS_SUCCESS
)
3601 hsa_fatal ("Could not extract a variable from its symbol", status
);
3606 } var_table
[var_count
];
3607 GOMP_OFFLOAD_dev2host (agent
->device_id
, var_table
,
3608 (void*)var_table_addr
, sizeof (var_table
));
3610 for (unsigned i
= 0; i
< var_count
; i
++)
3612 pair
->start
= var_table
[i
].addr
;
3613 pair
->end
= var_table
[i
].addr
+ var_table
[i
].size
;
3614 GCN_DEBUG ("Found variable at %p with size %lu\n",
3615 (void *)var_table
[i
].addr
, var_table
[i
].size
);
3620 if (ind_func_count
> 0)
3622 hsa_status_t status
;
3624 /* Read indirect function table from image. */
3625 hsa_executable_symbol_t ind_funcs_symbol
;
3626 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3627 ".offload_ind_func_table",
3629 0, &ind_funcs_symbol
);
3631 if (status
!= HSA_STATUS_SUCCESS
)
3632 hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3633 "code object", status
);
3635 uint64_t ind_funcs_table_addr
;
3636 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3637 (ind_funcs_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3638 &ind_funcs_table_addr
);
3639 if (status
!= HSA_STATUS_SUCCESS
)
3640 hsa_fatal ("Could not extract a variable from its symbol", status
);
3642 uint64_t ind_funcs_table
[ind_func_count
];
3643 GOMP_OFFLOAD_dev2host (agent
->device_id
, ind_funcs_table
,
3644 (void*) ind_funcs_table_addr
,
3645 sizeof (ind_funcs_table
));
3647 /* Build host->target address map for indirect functions. */
3648 uint64_t ind_fn_map
[ind_func_count
* 2 + 1];
3649 for (unsigned i
= 0; i
< ind_func_count
; i
++)
3651 ind_fn_map
[i
* 2] = host_ind_fn_table
[i
];
3652 ind_fn_map
[i
* 2 + 1] = ind_funcs_table
[i
];
3653 GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3654 i
, host_ind_fn_table
[i
], ind_funcs_table
[i
]);
3656 ind_fn_map
[ind_func_count
* 2] = 0;
3658 /* Write the map onto the target. */
3659 void *map_target_addr
3660 = GOMP_OFFLOAD_alloc (agent
->device_id
, sizeof (ind_fn_map
));
3661 GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr
);
3663 GOMP_OFFLOAD_host2dev (agent
->device_id
, map_target_addr
,
3665 sizeof (ind_fn_map
));
3667 /* Write address of the map onto the target. */
3668 hsa_executable_symbol_t symbol
;
3671 = hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3672 XSTRING (GOMP_INDIRECT_ADDR_MAP
),
3673 agent
->id
, 0, &symbol
);
3674 if (status
!= HSA_STATUS_SUCCESS
)
3675 hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3681 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3682 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3684 if (status
!= HSA_STATUS_SUCCESS
)
3685 hsa_fatal ("Could not extract a variable from its symbol", status
);
3686 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3687 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE
,
3689 if (status
!= HSA_STATUS_SUCCESS
)
3690 hsa_fatal ("Could not extract a variable size from its symbol",
3693 GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3696 GOMP_OFFLOAD_host2dev (agent
->device_id
, (void *) varptr
,
3698 sizeof (map_target_addr
));
3701 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS
));
3703 hsa_status_t status
;
3704 hsa_executable_symbol_t var_symbol
;
3705 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3706 XSTRING (GOMP_ADDITIONAL_ICVS
),
3707 agent
->id
, 0, &var_symbol
);
3708 if (status
== HSA_STATUS_SUCCESS
)
3713 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3714 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3716 if (status
!= HSA_STATUS_SUCCESS
)
3717 hsa_fatal ("Could not extract a variable from its symbol", status
);
3718 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3719 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE
,
3721 if (status
!= HSA_STATUS_SUCCESS
)
3722 hsa_fatal ("Could not extract a variable size from its symbol",
3725 pair
->start
= varptr
;
3726 pair
->end
= varptr
+ varsize
;
3730 /* The variable was not in this image. */
3731 GCN_DEBUG ("Variable not found in image: %s\n",
3732 XSTRING (GOMP_ADDITIONAL_ICVS
));
3733 pair
->start
= pair
->end
= 0;
3736 /* Ensure that constructors are run first. */
3737 struct GOMP_kernel_launch_attributes kla
=
3741 /* Work-group size. */
3745 if (module
->init_array_func
)
3747 init_kernel (module
->init_array_func
);
3748 run_kernel (module
->init_array_func
, NULL
, &kla
, NULL
, false);
3750 module
->constructors_run_p
= true;
3752 /* Don't report kernels that libgomp need not know about. */
3753 if (module
->init_array_func
)
3755 if (module
->fini_array_func
)
3758 if (rev_fn_table
!= NULL
&& kernel_count
== 0)
3759 *rev_fn_table
= NULL
;
3760 else if (rev_fn_table
!= NULL
)
3762 hsa_status_t status
;
3763 hsa_executable_symbol_t var_symbol
;
3764 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3765 ".offload_func_table",
3766 agent
->id
, 0, &var_symbol
);
3767 if (status
!= HSA_STATUS_SUCCESS
)
3768 hsa_fatal ("Could not find symbol for variable in the code object",
3770 uint64_t fn_table_addr
;
3771 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3772 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3774 if (status
!= HSA_STATUS_SUCCESS
)
3775 hsa_fatal ("Could not extract a variable from its symbol", status
);
3776 *rev_fn_table
= GOMP_PLUGIN_malloc (kernel_count
* sizeof (uint64_t));
3777 GOMP_OFFLOAD_dev2host (agent
->device_id
, *rev_fn_table
,
3778 (void*) fn_table_addr
,
3779 kernel_count
* sizeof (uint64_t));
3782 return kernel_count
+ var_count
+ other_count
;
3785 /* Unload GCN object-code module described by struct gcn_image_desc in
3786 TARGET_DATA from agent number N. Return TRUE on success. */
3789 GOMP_OFFLOAD_unload_image (int n
, unsigned version
, const void *target_data
)
3791 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3793 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3794 " (expected %u, received %u)",
3795 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3799 struct agent_info
*agent
;
3800 agent
= get_agent_info (n
);
3804 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3806 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3810 if (!agent
->module
|| agent
->module
->image_desc
!= target_data
)
3812 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3817 if (!destroy_module (agent
->module
, true))
3819 free (agent
->module
);
3820 agent
->module
= NULL
;
3821 if (!destroy_hsa_program (agent
))
3823 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3825 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3831 /* Deinitialize all information and status associated with agent number N. We
3832 do not attempt any synchronization, assuming the user and libgomp will not
3833 attempt deinitialization of a device that is in any way being used at the
3834 same time. Return TRUE on success. */
3837 GOMP_OFFLOAD_fini_device (int n
)
3839 struct agent_info
*agent
= get_agent_info (n
);
3843 if (!agent
->initialized
)
3846 if (agent
->omp_async_queue
)
3848 GOMP_OFFLOAD_openacc_async_destruct (agent
->omp_async_queue
);
3849 agent
->omp_async_queue
= NULL
;
3854 if (!destroy_module (agent
->module
, false))
3856 free (agent
->module
);
3857 agent
->module
= NULL
;
3860 if (!destroy_ephemeral_memories (agent
))
3863 if (!destroy_hsa_program (agent
))
3866 hsa_status_t status
= hsa_fns
.hsa_queue_destroy_fn (agent
->sync_queue
);
3867 if (status
!= HSA_STATUS_SUCCESS
)
3868 return hsa_error ("Error destroying command queue", status
);
3870 if (pthread_mutex_destroy (&agent
->prog_mutex
))
3872 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3875 if (pthread_rwlock_destroy (&agent
->module_rwlock
))
3877 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3881 if (pthread_mutex_destroy (&agent
->async_queues_mutex
))
3883 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3886 if (pthread_mutex_destroy (&agent
->ephemeral_memories_write_lock
))
3888 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3891 agent
->initialized
= false;
3895 /* Return true if the HSA runtime can run function FN_PTR. */
3898 GOMP_OFFLOAD_can_run (void *fn_ptr
)
3900 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3902 init_kernel (kernel
);
3903 if (kernel
->initialization_failed
)
3904 GOMP_PLUGIN_fatal ("kernel initialization failed");
/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}
3918 /* Free memory from device N. */
3921 GOMP_OFFLOAD_free (int device
, void *ptr
)
3923 GCN_DEBUG ("Freeing memory on device %d\n", device
);
3925 hsa_status_t status
= hsa_fns
.hsa_memory_free_fn (ptr
);
3926 if (status
!= HSA_STATUS_SUCCESS
)
3928 hsa_error ("Could not free device memory", status
);
3932 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3933 bool profiling_dispatch_p
3934 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
3935 if (profiling_dispatch_p
)
3937 acc_prof_info
*prof_info
= thr
->prof_info
;
3938 acc_event_info data_event_info
;
3939 acc_api_info
*api_info
= thr
->api_info
;
3941 prof_info
->event_type
= acc_ev_free
;
3943 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
3944 data_event_info
.data_event
.valid_bytes
3945 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
3946 data_event_info
.data_event
.parent_construct
3947 = acc_construct_parallel
;
3948 data_event_info
.data_event
.implicit
= 1;
3949 data_event_info
.data_event
.tool_info
= NULL
;
3950 data_event_info
.data_event
.var_name
= NULL
;
3951 data_event_info
.data_event
.bytes
= 0;
3952 data_event_info
.data_event
.host_ptr
= NULL
;
3953 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
3955 api_info
->device_api
= acc_device_api_other
;
3957 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
3964 /* Copy data from DEVICE to host. */
3967 GOMP_OFFLOAD_dev2host (int device
, void *dst
, const void *src
, size_t n
)
3969 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n
, device
,
3971 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3972 if (status
!= HSA_STATUS_SUCCESS
)
3973 GOMP_PLUGIN_error ("memory copy failed");
/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  /* The wrapper handles error reporting internally — presumably; confirm
     against hsa_memory_copy_wrapper's definition elsewhere in this file.  */
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}
3988 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3991 GOMP_OFFLOAD_dev2dev (int device
, void *dst
, const void *src
, size_t n
)
3993 struct gcn_thread
*thread_data
= gcn_thread ();
3995 if (thread_data
&& !async_synchronous_p (thread_data
->async
))
3997 struct agent_info
*agent
= get_agent_info (device
);
3998 maybe_init_omp_async (agent
);
3999 queue_push_copy (agent
->omp_async_queue
, dst
, src
, n
);
4003 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n
,
4004 device
, src
, device
, dst
);
4005 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
4006 if (status
!= HSA_STATUS_SUCCESS
)
4007 GOMP_PLUGIN_error ("memory copy failed");
4011 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
4012 measured in bytes. So we have:
4014 dim1_size: number of bytes to copy on innermost dimension ("row")
4015 dim0_len: number of rows to copy
4016 dst: base pointer for destination of copy
4017 dst_offset1_size: innermost row offset (for dest), in bytes
4018 dst_offset0_len: offset, number of rows (for dest)
4019 dst_dim1_size: whole-array dest row length, in bytes (pitch)
4020 src: base pointer for source of copy
4021 src_offset1_size: innermost row offset (for source), in bytes
4022 src_offset0_len: offset, number of rows (for source)
4023 src_dim1_size: whole-array source row length, in bytes (pitch)
4027 GOMP_OFFLOAD_memcpy2d (int dst_ord
, int src_ord
, size_t dim1_size
,
4028 size_t dim0_len
, void *dst
, size_t dst_offset1_size
,
4029 size_t dst_offset0_len
, size_t dst_dim1_size
,
4030 const void *src
, size_t src_offset1_size
,
4031 size_t src_offset0_len
, size_t src_dim1_size
)
4033 if (!hsa_fns
.hsa_amd_memory_lock_fn
4034 || !hsa_fns
.hsa_amd_memory_unlock_fn
4035 || !hsa_fns
.hsa_amd_memory_async_copy_rect_fn
)
4038 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
4039 out quietly if we have anything oddly-aligned rather than letting the
4040 driver raise an error. */
4041 if ((((uintptr_t) dst
) & 3) != 0 || (((uintptr_t) src
) & 3) != 0)
4044 if ((dst_dim1_size
& 3) != 0 || (src_dim1_size
& 3) != 0)
4047 /* Only handle host to device or device to host transfers here. */
4048 if ((dst_ord
== -1 && src_ord
== -1)
4049 || (dst_ord
!= -1 && src_ord
!= -1))
4052 hsa_amd_copy_direction_t dir
4053 = (src_ord
== -1) ? hsaHostToDevice
: hsaDeviceToHost
;
4054 hsa_agent_t copy_agent
;
4056 /* We need to pin (lock) host memory before we start the transfer. Try to
4057 lock the minimum size necessary, i.e. using partial first/last rows of the
4058 whole array. Something like this:
4062 c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
4063 o | ++#######+++++ <- whole row
4064 l | ++#######+++++ <- "
4065 s v ++#######..... <- last row apart from trailing remainder
4068 We could split very large transfers into several rectangular copies, but
4069 that is unimplemented for now. */
4071 size_t bounded_size_host
, first_elem_offset_host
;
4073 if (dir
== hsaHostToDevice
)
4075 bounded_size_host
= src_dim1_size
* (dim0_len
- 1) + dim1_size
;
4076 first_elem_offset_host
= src_offset0_len
* src_dim1_size
4078 host_ptr
= (void *) src
;
4079 struct agent_info
*agent
= get_agent_info (dst_ord
);
4080 copy_agent
= agent
->id
;
4084 bounded_size_host
= dst_dim1_size
* (dim0_len
- 1) + dim1_size
;
4085 first_elem_offset_host
= dst_offset0_len
* dst_dim1_size
4088 struct agent_info
*agent
= get_agent_info (src_ord
);
4089 copy_agent
= agent
->id
;
4095 = hsa_fns
.hsa_amd_memory_lock_fn (host_ptr
+ first_elem_offset_host
,
4096 bounded_size_host
, NULL
, 0, &agent_ptr
);
4097 /* We can't lock the host memory: don't give up though, we might still be
4098 able to use the slow path in our caller. So, don't make this an
4100 if (status
!= HSA_STATUS_SUCCESS
)
4103 hsa_pitched_ptr_t dstpp
, srcpp
;
4104 hsa_dim3_t dst_offsets
, src_offsets
, ranges
;
4108 hsa_signal_t completion_signal
;
4109 status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &completion_signal
);
4110 if (status
!= HSA_STATUS_SUCCESS
)
4116 if (dir
== hsaHostToDevice
)
4118 srcpp
.base
= agent_ptr
- first_elem_offset_host
;
4123 srcpp
.base
= (void *) src
;
4124 dstpp
.base
= agent_ptr
- first_elem_offset_host
;
4127 srcpp
.pitch
= src_dim1_size
;
4130 src_offsets
.x
= src_offset1_size
;
4131 src_offsets
.y
= src_offset0_len
;
4134 dstpp
.pitch
= dst_dim1_size
;
4137 dst_offsets
.x
= dst_offset1_size
;
4138 dst_offsets
.y
= dst_offset0_len
;
4141 ranges
.x
= dim1_size
;
4142 ranges
.y
= dim0_len
;
4146 = hsa_fns
.hsa_amd_memory_async_copy_rect_fn (&dstpp
, &dst_offsets
, &srcpp
,
4147 &src_offsets
, &ranges
,
4148 copy_agent
, dir
, 0, NULL
,
4150 /* If the rectangular copy fails, we might still be able to use the slow
4151 path. We need to unlock the host memory though, so don't return
4153 if (status
!= HSA_STATUS_SUCCESS
)
4156 hsa_fns
.hsa_signal_wait_acquire_fn (completion_signal
,
4157 HSA_SIGNAL_CONDITION_LT
, 1, UINT64_MAX
,
4158 HSA_WAIT_STATE_ACTIVE
);
4160 hsa_fns
.hsa_signal_destroy_fn (completion_signal
);
4163 status
= hsa_fns
.hsa_amd_memory_unlock_fn (host_ptr
+ first_elem_offset_host
);
4164 if (status
!= HSA_STATUS_SUCCESS
)
4165 hsa_fatal ("Could not unlock host memory", status
);
4170 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4171 measured in bytes. So we have:
4173 dim2_size: number of bytes to copy on innermost dimension ("row")
4174 dim1_len: number of rows per slice to copy
4175 dim0_len: number of slices to copy
4176 dst: base pointer for destination of copy
4177 dst_offset2_size: innermost row offset (for dest), in bytes
4178 dst_offset1_len: offset, number of rows (for dest)
4179 dst_offset0_len: offset, number of slices (for dest)
4180 dst_dim2_size: whole-array dest row length, in bytes (pitch)
4181 dst_dim1_len: whole-array number of rows in slice (for dest)
4182 src: base pointer for source of copy
4183 src_offset2_size: innermost row offset (for source), in bytes
4184 src_offset1_len: offset, number of rows (for source)
4185 src_offset0_len: offset, number of slices (for source)
4186 src_dim2_size: whole-array source row length, in bytes (pitch)
4187 src_dim1_len: whole-array number of rows in slice (for source)
4191 GOMP_OFFLOAD_memcpy3d (int dst_ord
, int src_ord
, size_t dim2_size
,
4192 size_t dim1_len
, size_t dim0_len
, void *dst
,
4193 size_t dst_offset2_size
, size_t dst_offset1_len
,
4194 size_t dst_offset0_len
, size_t dst_dim2_size
,
4195 size_t dst_dim1_len
, const void *src
,
4196 size_t src_offset2_size
, size_t src_offset1_len
,
4197 size_t src_offset0_len
, size_t src_dim2_size
,
4198 size_t src_dim1_len
)
4200 if (!hsa_fns
.hsa_amd_memory_lock_fn
4201 || !hsa_fns
.hsa_amd_memory_unlock_fn
4202 || !hsa_fns
.hsa_amd_memory_async_copy_rect_fn
)
4205 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
4206 out quietly if we have anything oddly-aligned rather than letting the
4207 driver raise an error. */
4208 if ((((uintptr_t) dst
) & 3) != 0 || (((uintptr_t) src
) & 3) != 0)
4211 if ((dst_dim2_size
& 3) != 0 || (src_dim2_size
& 3) != 0)
4214 /* Only handle host to device or device to host transfers here. */
4215 if ((dst_ord
== -1 && src_ord
== -1)
4216 || (dst_ord
!= -1 && src_ord
!= -1))
4219 hsa_amd_copy_direction_t dir
4220 = (src_ord
== -1) ? hsaHostToDevice
: hsaDeviceToHost
;
4221 hsa_agent_t copy_agent
;
4223 /* We need to pin (lock) host memory before we start the transfer. Try to
4224 lock the minimum size necessary, i.e. using partial first/last slices of
4225 the whole 3D array. Something like this:
4227 slice 0: slice 1: slice 2:
4228 __________ __________ __________
4229 ^ /+++++++++/ : /+++++++++/ : / /
4230 column /+++##++++/| | /+++##++++/| | /+++## / # = subarray
4231 / / ##++++/ | |/+++##++++/ | |/+++##++++/ + = area to pin
4232 /_________/ : /_________/ : /_________/
4235 We could split very large transfers into several rectangular copies, but
4236 that is unimplemented for now. */
4238 size_t bounded_size_host
, first_elem_offset_host
;
4240 if (dir
== hsaHostToDevice
)
4242 size_t slice_bytes
= src_dim2_size
* src_dim1_len
;
4243 bounded_size_host
= slice_bytes
* (dim0_len
- 1)
4244 + src_dim2_size
* (dim1_len
- 1)
4246 first_elem_offset_host
= src_offset0_len
* slice_bytes
4247 + src_offset1_len
* src_dim2_size
4249 host_ptr
= (void *) src
;
4250 struct agent_info
*agent
= get_agent_info (dst_ord
);
4251 copy_agent
= agent
->id
;
4255 size_t slice_bytes
= dst_dim2_size
* dst_dim1_len
;
4256 bounded_size_host
= slice_bytes
* (dim0_len
- 1)
4257 + dst_dim2_size
* (dim1_len
- 1)
4259 first_elem_offset_host
= dst_offset0_len
* slice_bytes
4260 + dst_offset1_len
* dst_dim2_size
4263 struct agent_info
*agent
= get_agent_info (src_ord
);
4264 copy_agent
= agent
->id
;
4270 = hsa_fns
.hsa_amd_memory_lock_fn (host_ptr
+ first_elem_offset_host
,
4271 bounded_size_host
, NULL
, 0, &agent_ptr
);
4272 /* We can't lock the host memory: don't give up though, we might still be
4273 able to use the slow path in our caller (maybe even with iterated memcpy2d
4274 calls). So, don't make this an error. */
4275 if (status
!= HSA_STATUS_SUCCESS
)
4278 hsa_pitched_ptr_t dstpp
, srcpp
;
4279 hsa_dim3_t dst_offsets
, src_offsets
, ranges
;
4283 hsa_signal_t completion_signal
;
4284 status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &completion_signal
);
4285 if (status
!= HSA_STATUS_SUCCESS
)
4291 if (dir
== hsaHostToDevice
)
4293 srcpp
.base
= agent_ptr
- first_elem_offset_host
;
4298 srcpp
.base
= (void *) src
;
4299 dstpp
.base
= agent_ptr
- first_elem_offset_host
;
4302 /* Pitch is measured in bytes. */
4303 srcpp
.pitch
= src_dim2_size
;
4304 /* Slice is also measured in bytes (i.e. total per-slice). */
4305 srcpp
.slice
= src_dim2_size
* src_dim1_len
;
4307 src_offsets
.x
= src_offset2_size
;
4308 src_offsets
.y
= src_offset1_len
;
4309 src_offsets
.z
= src_offset0_len
;
4312 dstpp
.pitch
= dst_dim2_size
;
4313 dstpp
.slice
= dst_dim2_size
* dst_dim1_len
;
4315 dst_offsets
.x
= dst_offset2_size
;
4316 dst_offsets
.y
= dst_offset1_len
;
4317 dst_offsets
.z
= dst_offset0_len
;
4319 ranges
.x
= dim2_size
;
4320 ranges
.y
= dim1_len
;
4321 ranges
.z
= dim0_len
;
4324 = hsa_fns
.hsa_amd_memory_async_copy_rect_fn (&dstpp
, &dst_offsets
, &srcpp
,
4325 &src_offsets
, &ranges
,
4326 copy_agent
, dir
, 0, NULL
,
4328 /* If the rectangular copy fails, we might still be able to use the slow
4329 path. We need to unlock the host memory though, so don't return
4331 if (status
!= HSA_STATUS_SUCCESS
)
4335 hsa_signal_value_t sv
4336 = hsa_fns
.hsa_signal_wait_acquire_fn (completion_signal
,
4337 HSA_SIGNAL_CONDITION_LT
, 1,
4339 HSA_WAIT_STATE_ACTIVE
);
4342 GCN_WARNING ("async copy rect failure");
4347 hsa_fns
.hsa_signal_destroy_fn (completion_signal
);
4350 status
= hsa_fns
.hsa_amd_memory_unlock_fn (host_ptr
+ first_elem_offset_host
);
4351 if (status
!= HSA_STATUS_SUCCESS
)
4352 hsa_fatal ("Could not unlock host memory", status
);
4358 /* {{{ OpenMP Plugin API */
4360 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4361 in VARS as a parameter. The kernel is identified by FN_PTR which must point
4362 to a kernel_info structure, and must have previously been loaded to the
4363 specified device. */
4366 GOMP_OFFLOAD_run (int device
, void *fn_ptr
, void *vars
, void **args
)
4368 struct agent_info
*agent
= get_agent_info (device
);
4369 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
4370 struct GOMP_kernel_launch_attributes def
;
4371 struct GOMP_kernel_launch_attributes
*kla
;
4372 assert (agent
== kernel
->agent
);
4374 /* If we get here then the kernel must be OpenMP. */
4375 kernel
->kind
= KIND_OPENMP
;
4377 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
4379 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4382 run_kernel (kernel
, vars
, kla
, NULL
, false);
4385 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
4386 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4387 GOMP_PLUGIN_target_task_completion when it has finished. */
4390 GOMP_OFFLOAD_async_run (int device
, void *tgt_fn
, void *tgt_vars
,
4391 void **args
, void *async_data
)
4393 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4394 struct agent_info
*agent
= get_agent_info (device
);
4395 struct kernel_info
*kernel
= (struct kernel_info
*) tgt_fn
;
4396 struct GOMP_kernel_launch_attributes def
;
4397 struct GOMP_kernel_launch_attributes
*kla
;
4398 assert (agent
== kernel
->agent
);
4400 /* If we get here then the kernel must be OpenMP. */
4401 kernel
->kind
= KIND_OPENMP
;
4403 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
4405 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4409 maybe_init_omp_async (agent
);
4410 queue_push_launch (agent
->omp_async_queue
, kernel
, tgt_vars
, kla
);
4411 queue_push_callback (agent
->omp_async_queue
,
4412 GOMP_PLUGIN_target_task_completion
, async_data
);
4416 /* {{{ OpenACC Plugin API */
4418 /* Run a synchronous OpenACC kernel. The device number is inferred from the
4419 already-loaded KERNEL. */
4422 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr
) (void *),
4423 size_t mapnum
__attribute__((unused
)),
4424 void **hostaddrs
__attribute__((unused
)),
4425 void **devaddrs
, unsigned *dims
,
4426 void *targ_mem_desc
)
4428 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
4430 gcn_exec (kernel
, devaddrs
, dims
, targ_mem_desc
, false, NULL
);
4433 /* Run an asynchronous OpenACC kernel on the specified queue. */
4436 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr
) (void *),
4437 size_t mapnum
__attribute__((unused
)),
4438 void **hostaddrs
__attribute__((unused
)),
4440 unsigned *dims
, void *targ_mem_desc
,
4441 struct goacc_asyncqueue
*aq
)
4443 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
4445 gcn_exec (kernel
, devaddrs
, dims
, targ_mem_desc
, true, aq
);
4448 /* Create a new asynchronous thread and queue for running future kernels. */
4450 struct goacc_asyncqueue
*
4451 GOMP_OFFLOAD_openacc_async_construct (int device
)
4453 struct agent_info
*agent
= get_agent_info (device
);
4455 pthread_mutex_lock (&agent
->async_queues_mutex
);
4457 struct goacc_asyncqueue
*aq
= GOMP_PLUGIN_malloc (sizeof (*aq
));
4458 aq
->agent
= get_agent_info (device
);
4460 aq
->next
= agent
->async_queues
;
4463 aq
->next
->prev
= aq
;
4464 aq
->id
= aq
->next
->id
+ 1;
4468 agent
->async_queues
= aq
;
4470 aq
->queue_first
= 0;
4472 aq
->drain_queue_stop
= 0;
4474 if (pthread_mutex_init (&aq
->mutex
, NULL
))
4476 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
4479 if (pthread_cond_init (&aq
->queue_cond_in
, NULL
))
4481 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4484 if (pthread_cond_init (&aq
->queue_cond_out
, NULL
))
4486 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4490 hsa_status_t status
= hsa_fns
.hsa_queue_create_fn (agent
->id
,
4492 HSA_QUEUE_TYPE_MULTI
,
4493 hsa_queue_callback
, NULL
,
4494 UINT32_MAX
, UINT32_MAX
,
4496 if (status
!= HSA_STATUS_SUCCESS
)
4497 hsa_fatal ("Error creating command queue", status
);
4499 int err
= pthread_create (&aq
->thread_drain_queue
, NULL
, &drain_queue
, aq
);
4501 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4503 GCN_DEBUG ("Async thread %d:%d: created\n", aq
->agent
->device_id
,
4506 pthread_mutex_unlock (&agent
->async_queues_mutex
);
/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.

   Returns true on success; on any teardown failure an error is reported
   via GOMP_PLUGIN_error/hsa_error and false is returned (the queue is
   left partially destroyed in that case).  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  /* Stop and join the drain thread before destroying the mutex and
     condition variables it uses.  */
  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  /* Unlink AQ from the agent's doubly-linked list of queues.  */
  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}
/* Return true (non-zero) if the specified async queue is currently empty.
   This is a non-blocking poll; it does not wait for tasks to finish.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq) ? 1 : 0;
}
4572 /* Block until the specified queue has executed all its tasks and the
4576 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue
*aq
)
4582 /* Add a serialization point across two async queues. Any new tasks added to
4583 AQ2, after this call, will not run until all tasks on AQ1, at the time
4584 of this call, have completed. */
4587 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue
*aq1
,
4588 struct goacc_asyncqueue
*aq2
)
4590 /* For serialize, stream aq2 waits for aq1 to complete work that has been
4591 scheduled to run on it up to this point. */
4594 struct placeholder
*placeholderp
= queue_push_placeholder (aq1
);
4595 queue_push_asyncwait (aq2
, placeholderp
);
/* Add an opaque callback to the given async queue.  FN will be invoked
   with DATA once all previously queued tasks on AQ have run.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}
4609 /* Queue up an asynchronous data copy from host to DEVICE. */
4612 GOMP_OFFLOAD_openacc_async_host2dev (int device
, void *dst
, const void *src
,
4613 size_t n
, struct goacc_asyncqueue
*aq
)
4615 struct agent_info
*agent
= get_agent_info (device
);
4616 assert (agent
== aq
->agent
);
4617 queue_push_copy (aq
, dst
, src
, n
);
4621 /* Queue up an asynchronous data copy from DEVICE to host. */
4624 GOMP_OFFLOAD_openacc_async_dev2host (int device
, void *dst
, const void *src
,
4625 size_t n
, struct goacc_asyncqueue
*aq
)
4627 struct agent_info
*agent
= get_agent_info (device
);
4628 assert (agent
== aq
->agent
);
4629 queue_push_copy (aq
, dst
, src
, n
);
/* Query a device property for the OpenACC acc_get_property API.
   Unknown or unsupported properties yield a zero-initialized value.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  /* Default result: 0 / NULL for anything we cannot answer.  */
  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported. */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
	size_t size;
	hsa_region_t region = agent->data_region;
	hsa_status_t status =
	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
	/* On failure we silently leave the default 0 rather than erroring.  */
	if (status == HSA_STATUS_SUCCESS)
	  propval.val = size;
	break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}
4669 /* Set up plugin-specific thread-local-data (host-side). */
4672 GOMP_OFFLOAD_openacc_create_thread_data (int ord
__attribute__((unused
)))
4674 struct gcn_thread
*thread_data
4675 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread
));
4677 thread_data
->async
= GOMP_ASYNC_SYNC
;
4679 return (void *) thread_data
;
4682 /* Clean up plugin-specific thread-local-data. */
4685 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)