libgomp/plugin/plugin-gcn.c

   1 /* Plugin for AMD GCN execution.
   2
   3    Copyright (C) 2013-2024 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* {{{ Includes and defines  */
  30
  31 #include "config.h"
  32 #include "symcat.h"
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <pthread.h>
  37 #include <inttypes.h>
  38 #include <stdbool.h>
  39 #include <limits.h>
  40 #include <hsa.h>
  41 #include <hsa_ext_amd.h>
  42 #include <dlfcn.h>
  43 #include <signal.h>
  44 #include "libgomp-plugin.h"
  45 #include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
  46 #include "gomp-constants.h"
  47 #include <elf.h>
  48 #include "oacc-plugin.h"
  49 #include "oacc-int.h"
  50 #include <assert.h>
  51
  52 /* These probably won't be in elf.h for a while.  */
  53 #ifndef R_AMDGPU_NONE
  54 #define R_AMDGPU_NONE           0
  55 #define R_AMDGPU_ABS32_LO       1       /* (S + A) & 0xFFFFFFFF  */
  56 #define R_AMDGPU_ABS32_HI       2       /* (S + A) >> 32  */
  57 #define R_AMDGPU_ABS64          3       /* S + A  */
  58 #define R_AMDGPU_REL32          4       /* S + A - P  */
  59 #define R_AMDGPU_REL64          5       /* S + A - P  */
  60 #define R_AMDGPU_ABS32          6       /* S + A  */
  61 #define R_AMDGPU_GOTPCREL       7       /* G + GOT + A - P  */
  62 #define R_AMDGPU_GOTPCREL32_LO  8       /* (G + GOT + A - P) & 0xFFFFFFFF  */
  63 #define R_AMDGPU_GOTPCREL32_HI  9       /* (G + GOT + A - P) >> 32  */
  64 #define R_AMDGPU_REL32_LO       10      /* (S + A - P) & 0xFFFFFFFF  */
  65 #define R_AMDGPU_REL32_HI       11      /* (S + A - P) >> 32  */
  66 #define R_AMDGPU_RELATIVE64     13      /* B + A  */
  67 #endif
  68
  69 /* GCN specific definitions for asynchronous queues.  */
  70
  71 #define ASYNC_QUEUE_SIZE 64
  72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
  73 #define DEBUG_QUEUES 0
  74 #define DEBUG_THREAD_SLEEP 0
  75 #define DEBUG_THREAD_SIGNAL 0
  76
  77 /* Defaults.  */
  78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
  79
  80 /* Secure getenv() which returns NULL if running as SUID/SGID.  */
  81 #ifndef HAVE_SECURE_GETENV
  82 #ifdef HAVE___SECURE_GETENV
  83 #define secure_getenv __secure_getenv
  84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  85   && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
  86
  87 #include <unistd.h>
  88
  89 /* Implementation of secure_getenv() for targets where it is not provided but
  90    we have at least means to test real and effective IDs. */
  91
  92 static char *
  93 secure_getenv (const char *name)
  94 {
  95   if ((getuid () == geteuid ()) && (getgid () == getegid ()))
  96     return getenv (name);
  97   else
  98     return NULL;
  99 }
 100
 101 #else
 102 #define secure_getenv getenv
 103 #endif
 104 #endif
 105
 106 /* }}}  */
 107 /* {{{ Types  */
 108
 109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
 110
 111 struct gcn_thread
 112 {
 113   /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
 114   int async;
 115 };
 116
 117 /* As an HSA runtime is dlopened, following structure defines function
 118    pointers utilized by the HSA plug-in.  */
 119
 120 struct hsa_runtime_fn_info
 121 {
 122   /* HSA runtime.  */
 123   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
 124                                         const char **status_string);
 125   hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
 126                                           void *value);
 127   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
 128                                          hsa_agent_info_t attribute,
 129                                          void *value);
 130   hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
 131                                       hsa_isa_info_t attribute,
 132                                       uint32_t index,
 133                                       void *value);
 134   hsa_status_t (*hsa_init_fn) (void);
 135   hsa_status_t (*hsa_iterate_agents_fn)
 136     (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
 137   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
 138                                           hsa_region_info_t attribute,
 139                                           void *value);
 140   hsa_status_t (*hsa_queue_create_fn)
 141     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
 142      void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
 143      void *data, uint32_t private_segment_size,
 144      uint32_t group_segment_size, hsa_queue_t **queue);
 145   hsa_status_t (*hsa_agent_iterate_regions_fn)
 146     (hsa_agent_t agent,
 147      hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
 148   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
 149   hsa_status_t (*hsa_executable_create_fn)
 150     (hsa_profile_t profile, hsa_executable_state_t executable_state,
 151      const char *options, hsa_executable_t *executable);
 152   hsa_status_t (*hsa_executable_global_variable_define_fn)
 153     (hsa_executable_t executable, const char *variable_name, void *address);
 154   hsa_status_t (*hsa_executable_load_code_object_fn)
 155     (hsa_executable_t executable, hsa_agent_t agent,
 156      hsa_code_object_t code_object, const char *options);
 157   hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
 158                                            const char *options);
 159   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
 160                                         uint32_t num_consumers,
 161                                         const hsa_agent_t *consumers,
 162                                         hsa_signal_t *signal);
 163   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
 164                                           void **ptr);
 165   hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
 166                                               hsa_access_permission_t access);
 167   hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
 168   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
 169   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
 170   hsa_status_t (*hsa_executable_get_symbol_fn)
 171     (hsa_executable_t executable, const char *module_name,
 172      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
 173      hsa_executable_symbol_t *symbol);
 174   hsa_status_t (*hsa_executable_symbol_get_info_fn)
 175     (hsa_executable_symbol_t executable_symbol,
 176      hsa_executable_symbol_info_t attribute, void *value);
 177   hsa_status_t (*hsa_executable_iterate_symbols_fn)
 178     (hsa_executable_t executable,
 179      hsa_status_t (*callback)(hsa_executable_t executable,
 180                               hsa_executable_symbol_t symbol, void *data),
 181      void *data);
 182   uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
 183                                                     uint64_t value);
 184   uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
 185   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
 186                                        hsa_signal_value_t value);
 187   void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
 188                                        hsa_signal_value_t value);
 189   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
 190     (hsa_signal_t signal, hsa_signal_condition_t condition,
 191      hsa_signal_value_t compare_value, uint64_t timeout_hint,
 192      hsa_wait_state_t wait_state_hint);
 193   hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
 194   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
 195
 196   hsa_status_t (*hsa_code_object_deserialize_fn)
 197     (void *serialized_code_object, size_t serialized_code_object_size,
 198      const char *options, hsa_code_object_t *code_object);
 199   hsa_status_t (*hsa_amd_memory_lock_fn)
 200     (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
 201      void **agent_ptr);
 202   hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
 203   hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
 204     (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
 205      const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
 206      const hsa_dim3_t *range, hsa_agent_t copy_agent,
 207      hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
 208      const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
 209 };
 210
 211 /* Structure describing the run-time and grid properties of an HSA kernel
 212    lauch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
 213
 214 struct GOMP_kernel_launch_attributes
 215 {
 216   /* Number of dimensions the workload has.  Maximum number is 3.  */
 217   uint32_t ndim;
 218   /* Size of the grid in the three respective dimensions.  */
 219   uint32_t gdims[3];
 220   /* Size of work-groups in the respective dimensions.  */
 221   uint32_t wdims[3];
 222 };
 223
 224 /* Collection of information needed for a dispatch of a kernel from a
 225    kernel.  */
 226
 227 struct kernel_dispatch
 228 {
 229   struct agent_info *agent;
 230   /* Pointer to a command queue associated with a kernel dispatch agent.  */
 231   void *queue;
 232   /* Pointer to a memory space used for kernel arguments passing.  */
 233   void *kernarg_address;
 234   /* Kernel object.  */
 235   uint64_t object;
 236   /* Synchronization signal used for dispatch synchronization.  */
 237   uint64_t signal;
 238   /* Private segment size.  */
 239   uint32_t private_segment_size;
 240   /* Group segment size.  */
 241   uint32_t group_segment_size;
 242 };
 243
 244 /* Structure of the kernargs segment, supporting console output.
 245
 246    This needs to match the definitions in Newlib, and the expectations
 247    in libgomp target code.  */
 248
 249 struct kernargs {
 250   struct kernargs_abi abi;
 251
 252   /* Output data.  */
 253   struct output output_data;
 254 };
 255
 256 /* A queue entry for a future asynchronous launch.  */
 257
 258 struct kernel_launch
 259 {
 260   struct kernel_info *kernel;
 261   void *vars;
 262   struct GOMP_kernel_launch_attributes kla;
 263 };
 264
 265 /* A queue entry for a future callback.  */
 266
 267 struct callback
 268 {
 269   void (*fn)(void *);
 270   void *data;
 271 };
 272
 273 /* A data struct for the copy_data callback.  */
 274
 275 struct copy_data
 276 {
 277   void *dst;
 278   const void *src;
 279   size_t len;
 280   struct goacc_asyncqueue *aq;
 281 };
 282
 283 /* A queue entry for a placeholder.  These correspond to a wait event.  */
 284
 285 struct placeholder
 286 {
 287   int executed;
 288   pthread_cond_t cond;
 289   pthread_mutex_t mutex;
 290 };
 291
 292 /* A queue entry for a wait directive.  */
 293
 294 struct asyncwait_info
 295 {
 296   struct placeholder *placeholderp;
 297 };
 298
 299 /* Encode the type of an entry in an async queue.  */
 300
 301 enum entry_type
 302 {
 303   KERNEL_LAUNCH,
 304   CALLBACK,
 305   ASYNC_WAIT,
 306   ASYNC_PLACEHOLDER
 307 };
 308
 309 /* An entry in an async queue.  */
 310
 311 struct queue_entry
 312 {
 313   enum entry_type type;
 314   union {
 315     struct kernel_launch launch;
 316     struct callback callback;
 317     struct asyncwait_info asyncwait;
 318     struct placeholder placeholder;
 319   } u;
 320 };
 321
 322 /* An async queue header.
 323
 324    OpenMP may create one of these.
 325    OpenACC may create many.  */
 326
 327 struct goacc_asyncqueue
 328 {
 329   struct agent_info *agent;
 330   hsa_queue_t *hsa_queue;
 331
 332   pthread_t thread_drain_queue;
 333   pthread_mutex_t mutex;
 334   pthread_cond_t queue_cond_in;
 335   pthread_cond_t queue_cond_out;
 336   struct queue_entry queue[ASYNC_QUEUE_SIZE];
 337   int queue_first;
 338   int queue_n;
 339   int drain_queue_stop;
 340
 341   int id;
 342   struct goacc_asyncqueue *prev;
 343   struct goacc_asyncqueue *next;
 344 };
 345
 346 /* Mkoffload uses this structure to describe a kernel.
 347
 348    OpenMP kernel dimensions are passed at runtime.
 349    OpenACC kernel dimensions are passed at compile time, here.  */
 350
 351 struct hsa_kernel_description
 352 {
 353   const char *name;
 354   int oacc_dims[3];  /* Only present for GCN kernels.  */
 355   int sgpr_count;
 356   int vpgr_count;
 357 };
 358
 359 /* Mkoffload uses this structure to describe an offload variable.  */
 360
 361 struct global_var_info
 362 {
 363   const char *name;
 364   void *address;
 365 };
 366
 367 /* Mkoffload uses this structure to describe all the kernels in a
 368    loadable module.  These are passed the libgomp via static constructors.  */
 369
 370 struct gcn_image_desc
 371 {
 372   struct gcn_image {
 373     size_t size;
 374     void *image;
 375   } *gcn_image;
 376   const unsigned kernel_count;
 377   struct hsa_kernel_description *kernel_infos;
 378   const unsigned ind_func_count;
 379   const unsigned global_variable_count;
 380 };
 381
 382 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
 383    support.
 384    See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
 385
 386 typedef enum {
 387   EF_AMDGPU_MACH_UNSUPPORTED = -1,
 388   EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
 389   EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
 390   EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
 391   EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
 392   EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
 393   EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
 394   EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041
 395 } EF_AMDGPU_MACH;
 396
 397 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
 398 typedef EF_AMDGPU_MACH gcn_isa;
 399
 400 /* Description of an HSA GPU agent (device) and the program associated with
 401    it.  */
 402
 403 struct agent_info
 404 {
 405   /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
 406   hsa_agent_t id;
 407   /* The user-visible device number.  */
 408   int device_id;
 409   /* Whether the agent has been initialized.  The fields below are usable only
 410      if it has been.  */
 411   bool initialized;
 412
 413   /* The instruction set architecture of the device. */
 414   gcn_isa device_isa;
 415   /* Name of the agent. */
 416   char name[64];
 417   /* Name of the vendor of the agent. */
 418   char vendor_name[64];
 419   /* Command queues of the agent.  */
 420   hsa_queue_t *sync_queue;
 421   struct goacc_asyncqueue *async_queues, *omp_async_queue;
 422   pthread_mutex_t async_queues_mutex;
 423
 424   /* The HSA memory region from which to allocate kernel arguments.  */
 425   hsa_region_t kernarg_region;
 426
 427   /* The HSA memory region from which to allocate device data.  */
 428   hsa_region_t data_region;
 429
 430   /* Allocated ephemeral memories (team arena and stack space).  */
 431   struct ephemeral_memories_list *ephemeral_memories_list;
 432   pthread_mutex_t ephemeral_memories_write_lock;
 433
 434   /* Read-write lock that protects kernels which are running or about to be run
 435      from interference with loading and unloading of images.  Needs to be
 436      locked for reading while a kernel is being run, and for writing if the
 437      list of modules is manipulated (and thus the HSA program invalidated).  */
 438   pthread_rwlock_t module_rwlock;
 439
 440   /* The module associated with this kernel.  */
 441   struct module_info *module;
 442
 443   /* Mutex enforcing that only one thread will finalize the HSA program.  A
 444      thread should have locked agent->module_rwlock for reading before
 445      acquiring it.  */
 446   pthread_mutex_t prog_mutex;
 447   /* Flag whether the HSA program that consists of all the modules has been
 448      finalized.  */
 449   bool prog_finalized;
 450   /* HSA executable - the finalized program that is used to locate kernels.  */
 451   hsa_executable_t executable;
 452 };
 453
 454 /* Information required to identify, finalize and run any given kernel.  */
 455
 456 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
 457
 458 struct kernel_info
 459 {
 460   /* Name of the kernel, required to locate it within the GCN object-code
 461      module.  */
 462   const char *name;
 463   /* The specific agent the kernel has been or will be finalized for and run
 464      on.  */
 465   struct agent_info *agent;
 466   /* The specific module where the kernel takes place.  */
 467   struct module_info *module;
 468   /* Information provided by mkoffload associated with the kernel.  */
 469   struct hsa_kernel_description *description;
 470   /* Mutex enforcing that at most once thread ever initializes a kernel for
 471      use.  A thread should have locked agent->module_rwlock for reading before
 472      acquiring it.  */
 473   pthread_mutex_t init_mutex;
 474   /* Flag indicating whether the kernel has been initialized and all fields
 475      below it contain valid data.  */
 476   bool initialized;
 477   /* Flag indicating that the kernel has a problem that blocks an execution.  */
 478   bool initialization_failed;
 479   /* The object to be put into the dispatch queue.  */
 480   uint64_t object;
 481   /* Required size of kernel arguments.  */
 482   uint32_t kernarg_segment_size;
 483   /* Required size of group segment.  */
 484   uint32_t group_segment_size;
 485   /* Required size of private segment.  */
 486   uint32_t private_segment_size;
 487   /* Set up for OpenMP or OpenACC?  */
 488   enum offload_kind kind;
 489 };
 490
 491 /* Information about a particular GCN module, its image and kernels.  */
 492
 493 struct module_info
 494 {
 495   /* The description with which the program has registered the image.  */
 496   struct gcn_image_desc *image_desc;
 497   /* GCN heap allocation.  */
 498   struct heap *heap;
 499   /* Physical boundaries of the loaded module.  */
 500   Elf64_Addr phys_address_start;
 501   Elf64_Addr phys_address_end;
 502
 503   bool constructors_run_p;
 504   struct kernel_info *init_array_func, *fini_array_func;
 505
 506   /* Number of kernels in this module.  */
 507   int kernel_count;
 508   /* An array of kernel_info structures describing each kernel in this
 509      module.  */
 510   struct kernel_info kernels[];
 511 };
 512
 513 /* A linked list of memory arenas allocated on the device.
 514    These are used by OpenMP, as a means to optimize per-team malloc,
 515    and for host-accessible stack space.  */
 516
 517 struct ephemeral_memories_list
 518 {
 519   struct ephemeral_memories_list *next;
 520
 521   /* The size is determined by the number of teams and threads.  */
 522   size_t size;
 523   /* The device address allocated memory.  */
 524   void *address;
 525   /* A flag to prevent two asynchronous kernels trying to use the same memory.
 526      The mutex is locked until the kernel exits.  */
 527   pthread_mutex_t in_use;
 528 };
 529
 530 /* Information about the whole HSA environment and all of its agents.  */
 531
 532 struct hsa_context_info
 533 {
 534   /* Whether the structure has been initialized.  */
 535   bool initialized;
 536   /* Number of usable GPU HSA agents in the system.  */
 537   int agent_count;
 538   /* Array of agent_info structures describing the individual HSA agents.  */
 539   struct agent_info *agents;
 540   /* Driver version string. */
 541   char driver_version_s[30];
 542 };
 543
 544 /* }}}  */
 545 /* {{{ Global variables  */
 546
 547 /* Information about the whole HSA environment and all of its agents.  */
 548
 549 static struct hsa_context_info hsa_context;
 550
 551 /* HSA runtime functions that are initialized in init_hsa_context.  */
 552
 553 static struct hsa_runtime_fn_info hsa_fns;
 554
 555 /* Heap space, allocated target-side, provided for use of newlib malloc.
 556    Each module should have it's own heap allocated.
 557    Beware that heap usage increases with OpenMP teams.  See also arenas.  */
 558
 559 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
 560
 561 /* Ephemeral memory sizes for each kernel launch.  */
 562
 563 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
 564 static int stack_size = DEFAULT_GCN_STACK_SIZE;
 565 static int lowlat_size = -1;
 566
 567 /* Flag to decide whether print to stderr information about what is going on.
 568    Set in init_debug depending on environment variables.  */
 569
 570 static bool debug;
 571
 572 /* Flag to decide if the runtime should suppress a possible fallback to host
 573    execution.  */
 574
 575 static bool suppress_host_fallback;
 576
 577 /* Flag to locate HSA runtime shared library that is dlopened
 578    by this plug-in.  */
 579
 580 static const char *hsa_runtime_lib;
 581
 582 /* Flag to decide if the runtime should support also CPU devices (can be
 583    a simulator).  */
 584
 585 static bool support_cpu_devices;
 586
 587 /* Runtime dimension overrides.  Zero indicates default.  */
 588
 589 static int override_x_dim = 0;
 590 static int override_z_dim = 0;
 591
 592 /* }}}  */
 593 /* {{{ Debug & Diagnostic  */
 594
 595 /* Print a message to stderr if GCN_DEBUG value is set to true.  */
 596
 597 #define DEBUG_PRINT(...) \
 598   do \
 599   { \
 600     if (debug) \
 601       { \
 602         fprintf (stderr, __VA_ARGS__); \
 603       } \
 604   } \
 605   while (false);
 606
 607 /* Flush stderr if GCN_DEBUG value is set to true.  */
 608
 609 #define DEBUG_FLUSH()                           \
 610   do {                                          \
 611     if (debug)                                  \
 612       fflush (stderr);                          \
 613   } while (false)
 614
 615 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
 616    is set to true.  */
 617
 618 #define DEBUG_LOG(prefix, ...)                  \
 619   do                                            \
 620     {                                           \
 621       DEBUG_PRINT (prefix);                     \
 622       DEBUG_PRINT (__VA_ARGS__);                        \
 623       DEBUG_FLUSH ();                           \
 624     } while (false)
 625
 626 /* Print a debugging message to stderr.  */
 627
 628 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
 629
 630 /* Print a warning message to stderr.  */
 631
 632 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
 633
 634 /* Print HSA warning STR with an HSA STATUS code.  */
 635
 636 static void
 637 hsa_warn (const char *str, hsa_status_t status)
 638 {
 639   if (!debug)
 640     return;
 641
 642   const char *hsa_error_msg = "[unknown]";
 643   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 644
 645   fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
 646            hsa_error_msg);
 647 }
 648
 649 /* Report a fatal error STR together with the HSA error corresponding to STATUS
 650    and terminate execution of the current process.  */
 651
 652 static void
 653 hsa_fatal (const char *str, hsa_status_t status)
 654 {
 655   const char *hsa_error_msg = "[unknown]";
 656   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 657   GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
 658                      hsa_error_msg);
 659 }
 660
 661 /* Like hsa_fatal, except only report error message, and return FALSE
 662    for propagating error processing to outside of plugin.  */
 663
 664 static bool
 665 hsa_error (const char *str, hsa_status_t status)
 666 {
 667   const char *hsa_error_msg = "[unknown]";
 668   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 669   GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
 670                      hsa_error_msg);
 671   return false;
 672 }
 673
 674 /* Dump information about the available hardware.  */
 675
 676 static void
 677 dump_hsa_system_info (void)
 678 {
 679   hsa_status_t status;
 680
 681   hsa_endianness_t endianness;
 682   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
 683                                            &endianness);
 684   if (status == HSA_STATUS_SUCCESS)
 685     switch (endianness)
 686       {
 687       case HSA_ENDIANNESS_LITTLE:
 688         GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
 689         break;
 690       case HSA_ENDIANNESS_BIG:
 691         GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
 692         break;
 693       default:
 694         GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
 695       }
 696   else
 697     GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
 698
 699   uint8_t extensions[128];
 700   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
 701                                            &extensions);
 702   if (status == HSA_STATUS_SUCCESS)
 703     {
 704       if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
 705         GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
 706     }
 707   else
 708     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
 709 }
 710
 711 /* Dump information about the available hardware.  */
 712
 713 static void
 714 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
 715 {
 716   switch (machine_model)
 717     {
 718     case HSA_MACHINE_MODEL_SMALL:
 719       GCN_DEBUG ("%s: SMALL\n", s);
 720       break;
 721     case HSA_MACHINE_MODEL_LARGE:
 722       GCN_DEBUG ("%s: LARGE\n", s);
 723       break;
 724     default:
 725       GCN_WARNING ("%s: UNKNOWN\n", s);
 726       break;
 727     }
 728 }
 729
 730 /* Dump information about the available hardware.  */
 731
 732 static void
 733 dump_profile (hsa_profile_t profile, const char *s)
 734 {
 735   switch (profile)
 736     {
 737     case HSA_PROFILE_FULL:
 738       GCN_DEBUG ("%s: FULL\n", s);
 739       break;
 740     case HSA_PROFILE_BASE:
 741       GCN_DEBUG ("%s: BASE\n", s);
 742       break;
 743     default:
 744       GCN_WARNING ("%s: UNKNOWN\n", s);
 745       break;
 746     }
 747 }
 748
 749 /* Dump information about a device memory region.  */
 750
 751 static hsa_status_t
 752 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
 753 {
 754   hsa_status_t status;
 755
 756   hsa_region_segment_t segment;
 757   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
 758                                            &segment);
 759   if (status == HSA_STATUS_SUCCESS)
 760     {
 761       if (segment == HSA_REGION_SEGMENT_GLOBAL)
 762         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
 763       else if (segment == HSA_REGION_SEGMENT_READONLY)
 764         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
 765       else if (segment == HSA_REGION_SEGMENT_PRIVATE)
 766         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
 767       else if (segment == HSA_REGION_SEGMENT_GROUP)
 768         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
 769       else
 770         GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
 771     }
 772   else
 773     GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
 774
 775   if (segment == HSA_REGION_SEGMENT_GLOBAL)
 776     {
 777       uint32_t flags;
 778       status
 779         = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
 780                                           &flags);
 781       if (status == HSA_STATUS_SUCCESS)
 782         {
 783           if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
 784             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
 785           if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
 786             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
 787           if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
 788             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
 789         }
 790       else
 791         GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
 792     }
 793
 794   size_t size;
 795   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
 796   if (status == HSA_STATUS_SUCCESS)
 797     GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
 798   else
 799     GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
 800
 801   status
 802     = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
 803                                       &size);
 804   if (status == HSA_STATUS_SUCCESS)
 805     GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
 806   else
 807     GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
 808
 809   bool alloc_allowed;
 810   status
 811     = hsa_fns.hsa_region_get_info_fn (region,
 812                                       HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
 813                                       &alloc_allowed);
 814   if (status == HSA_STATUS_SUCCESS)
 815     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
 816   else
 817     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
 818
 819   if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
 820     return HSA_STATUS_SUCCESS;
 821
 822   status
 823     = hsa_fns.hsa_region_get_info_fn (region,
 824                                       HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
 825                                       &size);
 826   if (status == HSA_STATUS_SUCCESS)
 827     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
 828   else
 829     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
 830
 831   size_t align;
 832   status
 833     = hsa_fns.hsa_region_get_info_fn (region,
 834                                       HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
 835                                       &align);
 836   if (status == HSA_STATUS_SUCCESS)
 837     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
 838   else
 839     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
 840
 841   return HSA_STATUS_SUCCESS;
 842 }
 843
 844 /* Dump information about all the device memory regions.  */
 845
 846 static void
 847 dump_hsa_regions (hsa_agent_t agent)
 848 {
 849   hsa_status_t status;
 850   status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
 851                                                  dump_hsa_region,
 852                                                  NULL);
 853   if (status != HSA_STATUS_SUCCESS)
 854     hsa_error ("Dumping hsa regions failed", status);
 855 }
 856
 857 /* Dump information about the available devices.  */
 858
 859 static hsa_status_t
 860 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
 861 {
 862   hsa_status_t status;
 863
 864   char buf[64];
 865   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
 866                                           &buf);
 867   if (status == HSA_STATUS_SUCCESS)
 868     GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
 869   else
 870     GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
 871
 872   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
 873                                           &buf);
 874   if (status == HSA_STATUS_SUCCESS)
 875     GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
 876   else
 877     GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
 878
 879   hsa_machine_model_t machine_model;
 880   status
 881     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
 882                                      &machine_model);
 883   if (status == HSA_STATUS_SUCCESS)
 884     dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
 885   else
 886     GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
 887
 888   hsa_profile_t profile;
 889   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
 890                                           &profile);
 891   if (status == HSA_STATUS_SUCCESS)
 892     dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
 893   else
 894     GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
 895
 896   hsa_device_type_t device_type;
 897   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
 898                                           &device_type);
 899   if (status == HSA_STATUS_SUCCESS)
 900     {
 901       switch (device_type)
 902         {
 903         case HSA_DEVICE_TYPE_CPU:
 904           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
 905           break;
 906         case HSA_DEVICE_TYPE_GPU:
 907           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
 908           break;
 909         case HSA_DEVICE_TYPE_DSP:
 910           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
 911           break;
 912         default:
 913           GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
 914           break;
 915         }
 916     }
 917   else
 918     GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
 919
 920   uint32_t cu_count;
 921   status = hsa_fns.hsa_agent_get_info_fn
 922     (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
 923   if (status == HSA_STATUS_SUCCESS)
 924     GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
 925   else
 926     GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
 927
 928   uint32_t size;
 929   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
 930                                           &size);
 931   if (status == HSA_STATUS_SUCCESS)
 932     GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
 933   else
 934     GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
 935
 936   uint32_t max_dim;
 937   status = hsa_fns.hsa_agent_get_info_fn (agent,
 938                                           HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
 939                                           &max_dim);
 940   if (status == HSA_STATUS_SUCCESS)
 941     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
 942   else
 943     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
 944
 945   uint32_t max_size;
 946   status = hsa_fns.hsa_agent_get_info_fn (agent,
 947                                           HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
 948                                           &max_size);
 949   if (status == HSA_STATUS_SUCCESS)
 950     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
 951   else
 952     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
 953
 954   uint32_t grid_max_dim;
 955   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
 956                                           &grid_max_dim);
 957   if (status == HSA_STATUS_SUCCESS)
 958     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
 959   else
 960     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
 961
 962   uint32_t grid_max_size;
 963   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
 964                                           &grid_max_size);
 965   if (status == HSA_STATUS_SUCCESS)
 966     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
 967   else
 968     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
 969
 970   dump_hsa_regions (agent);
 971
 972   return HSA_STATUS_SUCCESS;
 973 }
 974
 975 /* Forward reference.  */
 976
 977 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
 978
 979 /* Helper function for dump_executable_symbols.  */
 980
 981 static hsa_status_t
 982 dump_executable_symbol (hsa_executable_t executable,
 983                         hsa_executable_symbol_t symbol,
 984                         void *data __attribute__((unused)))
 985 {
 986   char *name = get_executable_symbol_name (symbol);
 987
 988   if (name)
 989     {
 990       GCN_DEBUG ("executable symbol: %s\n", name);
 991       free (name);
 992     }
 993
 994   return HSA_STATUS_SUCCESS;
 995 }
 996
 997 /* Dump all global symbol in an executable.  */
 998
 999 static void
1000 dump_executable_symbols (hsa_executable_t executable)
1001 {
1002   hsa_status_t status;
1003   status
1004     = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1005                                                  dump_executable_symbol,
1006                                                  NULL);
1007   if (status != HSA_STATUS_SUCCESS)
1008     hsa_fatal ("Could not dump HSA executable symbols", status);
1009 }
1010
1011 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
1012
1013 static void
1014 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1015 {
1016   struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1017
1018   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1019   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1020   fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1021   fprintf (stderr, "%*sheap address: %p\n", indent, "",
1022            (void*)kernargs->abi.heap_ptr);
1023   fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
1024            "", (void*)kernargs->abi.arena_ptr,
1025            kernargs->abi.arena_size_per_team);
1026   fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
1027            "", (void*)kernargs->abi.stack_ptr,
1028            kernargs->abi.stack_size_per_thread);
1029   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1030   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1031            dispatch->private_segment_size);
1032   fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
1033            "", dispatch->group_segment_size);
1034   fprintf (stderr, "\n");
1035 }
1036
1037 /* }}}  */
1038 /* {{{ Utility functions  */
1039
1040 /* Cast the thread local storage to gcn_thread.  */
1041
1042 static inline struct gcn_thread *
1043 gcn_thread (void)
1044 {
1045   return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1046 }
1047
1048 /* Initialize debug and suppress_host_fallback according to the environment.  */
1049
1050 static void
1051 init_environment_variables (void)
1052 {
1053   if (secure_getenv ("GCN_DEBUG"))
1054     debug = true;
1055   else
1056     debug = false;
1057
1058   if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1059     suppress_host_fallback = true;
1060   else
1061     suppress_host_fallback = false;
1062
1063   hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1064   if (hsa_runtime_lib == NULL)
1065     hsa_runtime_lib = "libhsa-runtime64.so.1";
1066
1067   support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1068
1069   const char *x = secure_getenv ("GCN_NUM_TEAMS");
1070   if (!x)
1071     x = secure_getenv ("GCN_NUM_GANGS");
1072   if (x)
1073     override_x_dim = atoi (x);
1074
1075   const char *z = secure_getenv ("GCN_NUM_THREADS");
1076   if (!z)
1077     z = secure_getenv ("GCN_NUM_WORKERS");
1078   if (z)
1079     override_z_dim = atoi (z);
1080
1081   const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1082   if (heap)
1083     {
1084       size_t tmp = atol (heap);
1085       if (tmp)
1086         gcn_kernel_heap_size = tmp;
1087     }
1088
1089   const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
1090   if (arena)
1091     {
1092       int tmp = atoi (arena);
1093       if (tmp)
1094         team_arena_size = tmp;;
1095     }
1096
1097   const char *stack = secure_getenv ("GCN_STACK_SIZE");
1098   if (stack)
1099     {
1100       int tmp = atoi (stack);
1101       if (tmp)
1102         stack_size = tmp;;
1103     }
1104
1105   const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1106   if (lowlat)
1107     lowlat_size = atoi (lowlat);
1108 }
1109
1110 /* Return malloc'd string with name of SYMBOL.  */
1111
1112 static char *
1113 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1114 {
1115   hsa_status_t status;
1116   char *res;
1117   uint32_t len;
1118   const hsa_executable_symbol_info_t info_name_length
1119     = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1120
1121   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1122                                                       &len);
1123   if (status != HSA_STATUS_SUCCESS)
1124     {
1125       hsa_error ("Could not get length of symbol name", status);
1126       return NULL;
1127     }
1128
1129   res = GOMP_PLUGIN_malloc (len + 1);
1130
1131   const hsa_executable_symbol_info_t info_name
1132     = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1133
1134   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1135
1136   if (status != HSA_STATUS_SUCCESS)
1137     {
1138       hsa_error ("Could not get symbol name", status);
1139       free (res);
1140       return NULL;
1141     }
1142
1143   res[len] = '\0';
1144
1145   return res;
1146 }
1147
1148 /* Get the number of GPU Compute Units.  */
1149
1150 static int
1151 get_cu_count (struct agent_info *agent)
1152 {
1153   uint32_t cu_count;
1154   hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1155     (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1156   if (status == HSA_STATUS_SUCCESS)
1157     return cu_count;
1158   else
1159     return 64;  /* The usual number for older devices.  */
1160 }
1161
1162 /* Calculate the maximum grid size for OMP threads / OACC workers.
1163    This depends on the kernel's resource usage levels.  */
1164
1165 static int
1166 limit_worker_threads (int threads)
1167 {
1168   /* FIXME Do something more inteligent here.
1169      GCN can always run 4 threads within a Compute Unit, but
1170      more than that depends on register usage.  */
1171   if (threads > 16)
1172     threads = 16;
1173   return threads;
1174 }
1175
1176 /* This sets the maximum number of teams to twice the number of GPU Compute
1177    Units to avoid memory waste and corresponding memory access faults.  */
1178
1179 static int
1180 limit_teams (int teams, struct agent_info *agent)
1181 {
1182   int max_teams = 2 * get_cu_count (agent);
1183   if (teams > max_teams)
1184     teams = max_teams;
1185   return teams;
1186 }
1187
1188 /* Parse the target attributes INPUT provided by the compiler and return true
1189    if we should run anything all.  If INPUT is NULL, fill DEF with default
1190    values, then store INPUT or DEF into *RESULT.
1191
1192    This is used for OpenMP only.  */
1193
1194 static bool
1195 parse_target_attributes (void **input,
1196                          struct GOMP_kernel_launch_attributes *def,
1197                          struct GOMP_kernel_launch_attributes **result,
1198                          struct agent_info *agent)
1199 {
1200   if (!input)
1201     GOMP_PLUGIN_fatal ("No target arguments provided");
1202
1203   bool grid_attrs_found = false;
1204   bool gcn_dims_found = false;
1205   int gcn_teams = 0;
1206   int gcn_threads = 0;
1207   while (*input)
1208     {
1209       intptr_t id = (intptr_t) *input++, val;
1210
1211       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1212         val = (intptr_t) *input++;
1213       else
1214         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1215
1216       val = (val > INT_MAX) ? INT_MAX : val;
1217
1218       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1219           && ((id & GOMP_TARGET_ARG_ID_MASK)
1220               == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1221         {
1222           grid_attrs_found = true;
1223           break;
1224         }
1225       else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1226                == GOMP_TARGET_ARG_DEVICE_ALL)
1227         {
1228           gcn_dims_found = true;
1229           switch (id & GOMP_TARGET_ARG_ID_MASK)
1230             {
1231             case GOMP_TARGET_ARG_NUM_TEAMS:
1232               gcn_teams = limit_teams (val, agent);
1233               break;
1234             case GOMP_TARGET_ARG_THREAD_LIMIT:
1235               gcn_threads = limit_worker_threads (val);
1236               break;
1237             default:
1238               ;
1239             }
1240         }
1241     }
1242
1243   if (gcn_dims_found)
1244     {
1245       bool gfx900_workaround_p = false;
1246
1247       if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1248           && gcn_threads == 0 && override_z_dim == 0)
1249         {
1250           gfx900_workaround_p = true;
1251           GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1252                        "threads to at most 4 per team.\n");
1253           GCN_WARNING (" - If this is not a Vega 10 device, please use "
1254                        "GCN_NUM_THREADS=16\n");
1255         }
1256
1257       /* Ideally, when a dimension isn't explicitly specified, we should
1258          tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1259          In practice, we tune for peak performance on BabelStream, which
1260          for OpenACC is currently 32 threads per CU.  */
1261       def->ndim = 3;
1262       if (gcn_teams <= 0 && gcn_threads <= 0)
1263         {
1264           /* Set up a reasonable number of teams and threads.  */
1265           gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1266           def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1267           def->gdims[2] = gcn_threads;
1268         }
1269       else if (gcn_teams <= 0 && gcn_threads > 0)
1270         {
1271           /* Auto-scale the number of teams with the number of threads.  */
1272           def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1273           def->gdims[2] = gcn_threads;
1274         }
1275       else if (gcn_teams > 0 && gcn_threads <= 0)
1276         {
1277           int max_threads = gfx900_workaround_p ? 4 : 16;
1278
1279           /* Auto-scale the number of threads with the number of teams.  */
1280           def->gdims[0] = gcn_teams;
1281           def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1282           if (def->gdims[2] == 0)
1283             def->gdims[2] = 1;
1284           else if (def->gdims[2] > max_threads)
1285             def->gdims[2] = max_threads;
1286         }
1287       else
1288         {
1289           def->gdims[0] = gcn_teams;
1290           def->gdims[2] = gcn_threads;
1291         }
1292       def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
1293       def->wdims[0] = 1;  /* Single team per work-group.  */
1294       def->wdims[1] = 64;
1295       def->wdims[2] = 16;
1296       *result = def;
1297       return true;
1298     }
1299   else if (!grid_attrs_found)
1300     {
1301       def->ndim = 1;
1302       def->gdims[0] = 1;
1303       def->gdims[1] = 1;
1304       def->gdims[2] = 1;
1305       def->wdims[0] = 1;
1306       def->wdims[1] = 1;
1307       def->wdims[2] = 1;
1308       *result = def;
1309       GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1310       return true;
1311     }
1312
1313   struct GOMP_kernel_launch_attributes *kla;
1314   kla = (struct GOMP_kernel_launch_attributes *) *input;
1315   *result = kla;
1316   if (kla->ndim == 0 || kla->ndim > 3)
1317     GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1318
1319   GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1320   unsigned i;
1321   for (i = 0; i < kla->ndim; i++)
1322     {
1323       GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
1324                  kla->gdims[i], kla->wdims[i]);
1325       if (kla->gdims[i] == 0)
1326         return false;
1327     }
1328   return true;
1329 }
1330
1331 /* Return the group size given the requested GROUP size, GRID size and number
1332    of grid dimensions NDIM.  */
1333
1334 static uint32_t
1335 get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1336 {
1337   if (group == 0)
1338     {
1339       /* TODO: Provide a default via environment or device characteristics.  */
1340       if (ndim == 1)
1341         group = 64;
1342       else if (ndim == 2)
1343         group = 8;
1344       else
1345         group = 4;
1346     }
1347
1348   if (group > grid)
1349     group = grid;
1350   return group;
1351 }
1352
1353 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
1354
1355 static void
1356 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1357 {
1358   __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1359 }
1360
1361 /* A never-called callback for the HSA command queues.  These signal events
1362    that we don't use, so we trigger an error.
1363
1364    This "queue" is not to be confused with the async queues, below.  */
1365
1366 static void
1367 hsa_queue_callback (hsa_status_t status,
1368                 hsa_queue_t *queue __attribute__ ((unused)),
1369                 void *data __attribute__ ((unused)))
1370 {
1371   hsa_fatal ("Asynchronous queue error", status);
1372 }
1373
1374 /* }}}  */
1375 /* {{{ HSA initialization  */
1376
1377 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
1378
1379 static bool
1380 init_hsa_runtime_functions (void)
1381 {
1382 #define DLSYM_FN(function) \
1383   hsa_fns.function##_fn = dlsym (handle, #function); \
1384   if (hsa_fns.function##_fn == NULL) \
1385     return false;
1386 #define DLSYM_OPT_FN(function) \
1387   hsa_fns.function##_fn = dlsym (handle, #function);
1388   void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1389   if (handle == NULL)
1390     return false;
1391
1392   DLSYM_FN (hsa_status_string)
1393   DLSYM_FN (hsa_system_get_info)
1394   DLSYM_FN (hsa_agent_get_info)
1395   DLSYM_FN (hsa_init)
1396   DLSYM_FN (hsa_iterate_agents)
1397   DLSYM_FN (hsa_region_get_info)
1398   DLSYM_FN (hsa_queue_create)
1399   DLSYM_FN (hsa_agent_iterate_regions)
1400   DLSYM_FN (hsa_executable_destroy)
1401   DLSYM_FN (hsa_executable_create)
1402   DLSYM_FN (hsa_executable_global_variable_define)
1403   DLSYM_FN (hsa_executable_load_code_object)
1404   DLSYM_FN (hsa_executable_freeze)
1405   DLSYM_FN (hsa_signal_create)
1406   DLSYM_FN (hsa_memory_allocate)
1407   DLSYM_FN (hsa_memory_assign_agent)
1408   DLSYM_FN (hsa_memory_copy)
1409   DLSYM_FN (hsa_memory_free)
1410   DLSYM_FN (hsa_signal_destroy)
1411   DLSYM_FN (hsa_executable_get_symbol)
1412   DLSYM_FN (hsa_executable_symbol_get_info)
1413   DLSYM_FN (hsa_executable_iterate_symbols)
1414   DLSYM_FN (hsa_queue_add_write_index_release)
1415   DLSYM_FN (hsa_queue_load_read_index_acquire)
1416   DLSYM_FN (hsa_signal_wait_acquire)
1417   DLSYM_FN (hsa_signal_store_relaxed)
1418   DLSYM_FN (hsa_signal_store_release)
1419   DLSYM_FN (hsa_signal_load_acquire)
1420   DLSYM_FN (hsa_queue_destroy)
1421   DLSYM_FN (hsa_code_object_deserialize)
1422   DLSYM_OPT_FN (hsa_amd_memory_lock)
1423   DLSYM_OPT_FN (hsa_amd_memory_unlock)
1424   DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
1425   return true;
1426 #undef DLSYM_OPT_FN
1427 #undef DLSYM_FN
1428 }
1429
1430 static gcn_isa isa_code (const char *isa);
1431
1432 /* Return true if the agent is a GPU and can accept of concurrent submissions
1433    from different threads.  */
1434
1435 static bool
1436 suitable_hsa_agent_p (hsa_agent_t agent)
1437 {
1438   hsa_device_type_t device_type;
1439   hsa_status_t status
1440     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1441                                      &device_type);
1442   if (status != HSA_STATUS_SUCCESS)
1443     return false;
1444
1445   switch (device_type)
1446     {
1447     case HSA_DEVICE_TYPE_GPU:
1448       {
1449         char name[64];
1450         hsa_status_t status
1451           = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
1452         if (status != HSA_STATUS_SUCCESS
1453             || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
1454           {
1455             GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
1456                        status == HSA_STATUS_SUCCESS ? name : "invalid");
1457             return false;
1458           }
1459       }
1460       break;
1461     case HSA_DEVICE_TYPE_CPU:
1462       if (!support_cpu_devices)
1463         return false;
1464       break;
1465     default:
1466       return false;
1467     }
1468
1469   uint32_t features = 0;
1470   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1471                                           &features);
1472   if (status != HSA_STATUS_SUCCESS
1473       || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1474     return false;
1475   hsa_queue_type_t queue_type;
1476   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1477                                           &queue_type);
1478   if (status != HSA_STATUS_SUCCESS
1479       || (queue_type != HSA_QUEUE_TYPE_MULTI))
1480     return false;
1481
1482   return true;
1483 }
1484
1485 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1486    agent_count in hsa_context.  */
1487
1488 static hsa_status_t
1489 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1490 {
1491   if (suitable_hsa_agent_p (agent))
1492     hsa_context.agent_count++;
1493   return HSA_STATUS_SUCCESS;
1494 }
1495
1496 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1497    id to the describing structure in the hsa context.  The index of the
1498    structure is pointed to by DATA, increment it afterwards.  */
1499
1500 static hsa_status_t
1501 assign_agent_ids (hsa_agent_t agent, void *data)
1502 {
1503   if (suitable_hsa_agent_p (agent))
1504     {
1505       int *agent_index = (int *) data;
1506       hsa_context.agents[*agent_index].id = agent;
1507       ++*agent_index;
1508     }
1509   return HSA_STATUS_SUCCESS;
1510 }
1511
1512 /* Initialize hsa_context if it has not already been done.
1513    Return TRUE on success.  */
1514
1515 static bool
1516 init_hsa_context (void)
1517 {
1518   hsa_status_t status;
1519   int agent_index = 0;
1520
1521   if (hsa_context.initialized)
1522     return true;
1523   init_environment_variables ();
1524   if (!init_hsa_runtime_functions ())
1525     {
1526       GCN_WARNING ("Run-time could not be dynamically opened\n");
1527       if (suppress_host_fallback)
1528         GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1529       return false;
1530     }
1531   status = hsa_fns.hsa_init_fn ();
1532   if (status != HSA_STATUS_SUCCESS)
1533     return hsa_error ("Run-time could not be initialized", status);
1534   GCN_DEBUG ("HSA run-time initialized for GCN\n");
1535
1536   if (debug)
1537     dump_hsa_system_info ();
1538
1539   status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1540   if (status != HSA_STATUS_SUCCESS)
1541     return hsa_error ("GCN GPU devices could not be enumerated", status);
1542   GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1543
1544   hsa_context.agents
1545     = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1546                                   * sizeof (struct agent_info));
1547   status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1548   if (status != HSA_STATUS_SUCCESS)
1549     return hsa_error ("Scanning compute agents failed", status);
1550   if (agent_index != hsa_context.agent_count)
1551     {
1552       GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1553       return false;
1554     }
1555
1556   if (debug)
1557     {
1558       status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1559       if (status != HSA_STATUS_SUCCESS)
1560         GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1561     }
1562
1563   uint16_t minor, major;
1564   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1565                                            &minor);
1566   if (status != HSA_STATUS_SUCCESS)
1567     GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1568   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1569                                            &major);
1570   if (status != HSA_STATUS_SUCCESS)
1571     GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1572
1573   size_t len = sizeof hsa_context.driver_version_s;
1574   int printed = snprintf (hsa_context.driver_version_s, len,
1575                           "HSA Runtime %hu.%hu", (unsigned short int)major,
1576                           (unsigned short int)minor);
1577   if (printed >= len)
1578     GCN_WARNING ("HSA runtime version string was truncated."
1579                  "Version %hu.%hu is too long.", (unsigned short int)major,
1580                  (unsigned short int)minor);
1581
1582   hsa_context.initialized = true;
1583   return true;
1584 }
1585
1586 /* Verify that hsa_context has already been initialized and return the
1587    agent_info structure describing device number N.  Return NULL on error.  */
1588
1589 static struct agent_info *
1590 get_agent_info (int n)
1591 {
1592   if (!hsa_context.initialized)
1593     {
1594       GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1595       return NULL;
1596     }
1597   if (n >= hsa_context.agent_count)
1598     {
1599       GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1600       return NULL;
1601     }
1602   if (!hsa_context.agents[n].initialized)
1603     {
1604       GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1605       return NULL;
1606     }
1607   return &hsa_context.agents[n];
1608 }
1609
1610 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1611
1612    Selects (breaks at) a suitable region of type KIND.  */
1613
1614 static hsa_status_t
1615 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1616                    hsa_region_global_flag_t kind)
1617 {
1618   hsa_status_t status;
1619   hsa_region_segment_t segment;
1620
1621   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1622                                            &segment);
1623   if (status != HSA_STATUS_SUCCESS)
1624     return status;
1625   if (segment != HSA_REGION_SEGMENT_GLOBAL)
1626     return HSA_STATUS_SUCCESS;
1627
1628   uint32_t flags;
1629   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1630                                            &flags);
1631   if (status != HSA_STATUS_SUCCESS)
1632     return status;
1633   if (flags & kind)
1634     {
1635       *retval = region;
1636       return HSA_STATUS_INFO_BREAK;
1637     }
1638   return HSA_STATUS_SUCCESS;
1639 }
1640
1641 /* Callback of hsa_agent_iterate_regions.
1642
1643    Selects a kernargs memory region.  */
1644
1645 static hsa_status_t
1646 get_kernarg_memory_region (hsa_region_t region, void *data)
1647 {
1648   return get_memory_region (region, (hsa_region_t *)data,
1649                             HSA_REGION_GLOBAL_FLAG_KERNARG);
1650 }
1651
1652 /* Callback of hsa_agent_iterate_regions.
1653
1654    Selects a coarse-grained memory region suitable for the heap and
1655    offload data.  */
1656
1657 static hsa_status_t
1658 get_data_memory_region (hsa_region_t region, void *data)
1659 {
1660   return get_memory_region (region, (hsa_region_t *)data,
1661                             HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1662 }
1663
1664 static int
1665 elf_gcn_isa_field (Elf64_Ehdr *image)
1666 {
1667   return image->e_flags & EF_AMDGPU_MACH_MASK;
1668 }
1669
1670 const static char *gcn_gfx803_s = "gfx803";
1671 const static char *gcn_gfx900_s = "gfx900";
1672 const static char *gcn_gfx906_s = "gfx906";
1673 const static char *gcn_gfx908_s = "gfx908";
1674 const static char *gcn_gfx90a_s = "gfx90a";
1675 const static char *gcn_gfx1030_s = "gfx1030";
1676 const static char *gcn_gfx1100_s = "gfx1100";
1677 const static int gcn_isa_name_len = 7;
1678
1679 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1680    support the ISA. */
1681
1682 static const char*
1683 isa_hsa_name (int isa) {
1684   switch(isa)
1685     {
1686     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1687       return gcn_gfx803_s;
1688     case EF_AMDGPU_MACH_AMDGCN_GFX900:
1689       return gcn_gfx900_s;
1690     case EF_AMDGPU_MACH_AMDGCN_GFX906:
1691       return gcn_gfx906_s;
1692     case EF_AMDGPU_MACH_AMDGCN_GFX908:
1693       return gcn_gfx908_s;
1694     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1695       return gcn_gfx90a_s;
1696     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1697       return gcn_gfx1030_s;
1698     case EF_AMDGPU_MACH_AMDGCN_GFX1100:
1699       return gcn_gfx1100_s;
1700     }
1701   return NULL;
1702 }
1703
1704 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1705    with -march) or NULL if we do not support the ISA.
1706    Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */
1707
1708 static const char*
1709 isa_gcc_name (int isa) {
1710   switch(isa)
1711     {
1712     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1713       return "fiji";
1714     default:
1715       return isa_hsa_name (isa);
1716     }
1717 }
1718
1719 /* Returns the code which is used in the GCN object code to identify the ISA with
1720    the given name (as used by the HSA runtime).  */
1721
1722 static gcn_isa
1723 isa_code(const char *isa) {
1724   if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1725     return EF_AMDGPU_MACH_AMDGCN_GFX803;
1726
1727   if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1728     return EF_AMDGPU_MACH_AMDGCN_GFX900;
1729
1730   if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1731     return EF_AMDGPU_MACH_AMDGCN_GFX906;
1732
1733   if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1734     return EF_AMDGPU_MACH_AMDGCN_GFX908;
1735
1736   if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1737     return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1738
1739   if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
1740     return EF_AMDGPU_MACH_AMDGCN_GFX1030;
1741
1742   if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
1743     return EF_AMDGPU_MACH_AMDGCN_GFX1100;
1744
1745   return EF_AMDGPU_MACH_UNSUPPORTED;
1746 }
1747
1748 /* CDNA2 devices have twice as many VGPRs compared to older devices.  */
1749
1750 static int
1751 max_isa_vgprs (int isa)
1752 {
1753   switch (isa)
1754     {
1755     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1756     case EF_AMDGPU_MACH_AMDGCN_GFX900:
1757     case EF_AMDGPU_MACH_AMDGCN_GFX906:
1758     case EF_AMDGPU_MACH_AMDGCN_GFX908:
1759       return 256;
1760     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1761       return 512;
1762     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1763       return 512;  /* 512 SIMD32 = 256 wavefrontsize64.  */
1764     case EF_AMDGPU_MACH_AMDGCN_GFX1100:
1765       return 1536; /* 1536 SIMD32 = 768 wavefrontsize64.  */
1766     }
1767   GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1768 }
1769
1770 /* }}}  */
1771 /* {{{ Run  */
1772
1773 /* Create or reuse a team arena and stack space.
1774
1775    Team arenas are used by OpenMP to avoid calling malloc multiple times
1776    while setting up each team.  This is purely a performance optimization.
1777
1778    The stack space is used by all kernels.  We must allocate it in such a
1779    way that the reverse offload implmentation can access the data.
1780
1781    Allocating this memory costs performance, so this function will reuse an
1782    existing allocation if a large enough one is idle.
1783    The memory lock is released, but not deallocated, when the kernel exits.  */
1784
1785 static void
1786 configure_ephemeral_memories (struct kernel_info *kernel,
1787                               struct kernargs_abi *kernargs, int num_teams,
1788                               int num_threads)
1789 {
1790   struct agent_info *agent = kernel->agent;
1791   struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1792   struct ephemeral_memories_list *item;
1793
1794   int actual_arena_size = (kernel->kind == KIND_OPENMP
1795                            ? team_arena_size : 0);
1796   int actual_arena_total_size = actual_arena_size * num_teams;
1797   size_t size = (actual_arena_total_size
1798                  + num_teams * num_threads * stack_size);
1799
1800   for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1801     {
1802       if (item->size < size)
1803         continue;
1804
1805       if (pthread_mutex_trylock (&item->in_use) == 0)
1806         break;
1807     }
1808
1809   if (!item)
1810     {
1811       GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1812                  " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1813                  num_teams, num_threads, size);
1814
1815       if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1816         {
1817           GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1818           return;
1819         }
1820       item = malloc (sizeof (*item));
1821       item->size = size;
1822       item->next = NULL;
1823       *next_ptr = item;
1824
1825       if (pthread_mutex_init (&item->in_use, NULL))
1826         {
1827           GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1828           return;
1829         }
1830       if (pthread_mutex_lock (&item->in_use))
1831         {
1832           GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1833           return;
1834         }
1835       if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1836         {
1837           GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1838           return;
1839         }
1840
1841       hsa_status_t status;
1842       status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1843                                                &item->address);
1844       if (status != HSA_STATUS_SUCCESS)
1845         hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1846       status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1847                                                    HSA_ACCESS_PERMISSION_RW);
1848       if (status != HSA_STATUS_SUCCESS)
1849         hsa_fatal ("Could not assign arena & stack memory to device", status);
1850     }
1851
1852   kernargs->arena_ptr = (actual_arena_total_size
1853                          ? (uint64_t)item->address
1854                          : 0);
1855   kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1856   kernargs->arena_size_per_team = actual_arena_size;
1857   kernargs->stack_size_per_thread = stack_size;
1858 }
1859
1860 /* Mark an ephemeral memory space available for reuse.  */
1861
1862 static void
1863 release_ephemeral_memories (struct agent_info* agent, void *address)
1864 {
1865   struct ephemeral_memories_list *item;
1866
1867   for (item = agent->ephemeral_memories_list; item; item = item->next)
1868     {
1869       if (item->address == address)
1870         {
1871           if (pthread_mutex_unlock (&item->in_use))
1872             GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1873           return;
1874         }
1875     }
1876   GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1877 }
1878
1879 /* Clean up all the allocated team arenas.  */
1880
1881 static bool
1882 destroy_ephemeral_memories (struct agent_info *agent)
1883 {
1884   struct ephemeral_memories_list *item, *next;
1885
1886   for (item = agent->ephemeral_memories_list; item; item = next)
1887     {
1888       next = item->next;
1889       hsa_fns.hsa_memory_free_fn (item->address);
1890       if (pthread_mutex_destroy (&item->in_use))
1891         {
1892           GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1893           return false;
1894         }
1895       free (item);
1896     }
1897   agent->ephemeral_memories_list = NULL;
1898
1899   return true;
1900 }
1901
1902 /* Allocate memory on a specified device.  */
1903
1904 static void *
1905 alloc_by_agent (struct agent_info *agent, size_t size)
1906 {
1907   GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1908
1909   void *ptr;
1910   hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1911                                                         size, &ptr);
1912   if (status != HSA_STATUS_SUCCESS)
1913     {
1914       hsa_error ("Could not allocate device memory", status);
1915       return NULL;
1916     }
1917
1918   status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1919                                                HSA_ACCESS_PERMISSION_RW);
1920   if (status != HSA_STATUS_SUCCESS)
1921     {
1922       hsa_error ("Could not assign data memory to device", status);
1923       return NULL;
1924     }
1925
1926   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1927   bool profiling_dispatch_p
1928     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1929   if (profiling_dispatch_p)
1930     {
1931       acc_prof_info *prof_info = thr->prof_info;
1932       acc_event_info data_event_info;
1933       acc_api_info *api_info = thr->api_info;
1934
1935       prof_info->event_type = acc_ev_alloc;
1936
1937       data_event_info.data_event.event_type = prof_info->event_type;
1938       data_event_info.data_event.valid_bytes
1939         = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1940       data_event_info.data_event.parent_construct
1941         = acc_construct_parallel;
1942       data_event_info.data_event.implicit = 1;
1943       data_event_info.data_event.tool_info = NULL;
1944       data_event_info.data_event.var_name = NULL;
1945       data_event_info.data_event.bytes = size;
1946       data_event_info.data_event.host_ptr = NULL;
1947       data_event_info.data_event.device_ptr = (void *) ptr;
1948
1949       api_info->device_api = acc_device_api_other;
1950
1951       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1952                                             api_info);
1953     }
1954
1955   return ptr;
1956 }
1957
1958 /* Create kernel dispatch data structure for given KERNEL, along with
1959    the necessary device signals and memory allocations.  */
1960
1961 static struct kernel_dispatch *
1962 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1963                         int num_threads)
1964 {
1965   struct agent_info *agent = kernel->agent;
1966   struct kernel_dispatch *shadow
1967     = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1968
1969   shadow->agent = kernel->agent;
1970   shadow->object = kernel->object;
1971
1972   hsa_signal_t sync_signal;
1973   hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1974   if (status != HSA_STATUS_SUCCESS)
1975     hsa_fatal ("Error creating the GCN sync signal", status);
1976
1977   shadow->signal = sync_signal.handle;
1978   shadow->private_segment_size = kernel->private_segment_size;
1979
1980   if (lowlat_size < 0)
1981     {
1982       /* Divide the LDS between the number of running teams.
1983          Allocate not less than is defined in the kernel metadata.  */
1984       int teams_per_cu = num_teams / get_cu_count (agent);
1985       int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
1986       shadow->group_segment_size
1987         = (kernel->group_segment_size > LDS_per_team
1988            ? kernel->group_segment_size
1989            : LDS_per_team);;
1990     }
1991   else if (lowlat_size < GCN_LOWLAT_HEAP+8)
1992     /* Ensure that there's space for the OpenMP libgomp data.  */
1993     shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
1994   else
1995     shadow->group_segment_size = (lowlat_size > 65536
1996                                   ? 65536
1997                                   : lowlat_size);
1998
1999   /* We expect kernels to request a single pointer, explicitly, and the
2000      rest of struct kernargs, implicitly.  If they request anything else
2001      then something is wrong.  */
2002   if (kernel->kernarg_segment_size > 8)
2003     {
2004       GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
2005       return NULL;
2006     }
2007
2008   status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
2009                                            sizeof (struct kernargs),
2010                                            &shadow->kernarg_address);
2011   if (status != HSA_STATUS_SUCCESS)
2012     hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
2013   struct kernargs *kernargs = shadow->kernarg_address;
2014
2015   /* Zero-initialize the output_data (minimum needed).  */
2016   kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
2017   kernargs->output_data.next_output = 0;
2018   for (unsigned i = 0;
2019        i < (sizeof (kernargs->output_data.queue)
2020             / sizeof (kernargs->output_data.queue[0]));
2021        i++)
2022     kernargs->output_data.queue[i].written = 0;
2023   kernargs->output_data.consumed = 0;
2024
2025   /* Pass in the heap location.  */
2026   kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
2027
2028   /* Create the ephemeral memory spaces.  */
2029   configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
2030
2031   /* Ensure we can recognize unset return values.  */
2032   kernargs->output_data.return_value = 0xcafe0000;
2033
2034   return shadow;
2035 }
2036
2037 static void
2038 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
2039                          uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
2040 {
2041   int dev_num = dev_num64;
2042   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
2043                           NULL);
2044 }
2045
2046 /* Output any data written to console output from the kernel.  It is expected
2047    that this function is polled during kernel execution.
2048
2049    We print all entries from the last item printed to the next entry without
2050    a "written" flag.  If the "final" flag is set then it'll continue right to
2051    the end.
2052
2053    The print buffer is circular, but the from and to locations don't wrap when
2054    the buffer does, so the output limit is UINT_MAX.  The target blocks on
2055    output when the buffer is full.  */
2056
2057 static void
2058 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2059                 bool final)
2060 {
2061   unsigned int limit = (sizeof (kernargs->output_data.queue)
2062                         / sizeof (kernargs->output_data.queue[0]));
2063
2064   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2065                                        __ATOMIC_ACQUIRE);
2066   unsigned int to = kernargs->output_data.next_output;
2067
2068   if (from > to)
2069     {
2070       /* Overflow.  */
2071       if (final)
2072         printf ("GCN print buffer overflowed.\n");
2073       return;
2074     }
2075
2076   unsigned int i;
2077   for (i = from; i < to; i++)
2078     {
2079       struct printf_data *data = &kernargs->output_data.queue[i%limit];
2080
2081       if (!data->written && !final)
2082         break;
2083
2084       switch (data->type)
2085         {
2086         case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2087         case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2088         case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2089         case 3: printf ("%.128s%.128s", data->msg, data->text); break;
2090         case 4:
2091           process_reverse_offload (data->value_u64[0], data->value_u64[1],
2092                                    data->value_u64[2], data->value_u64[3],
2093                                    data->value_u64[4], data->value_u64[5]);
2094           break;
2095         default: printf ("GCN print buffer error!\n"); break;
2096         }
2097       data->written = 0;
2098       __atomic_store_n (&kernargs->output_data.consumed, i+1,
2099                         __ATOMIC_RELEASE);
2100     }
2101   fflush (stdout);
2102 }
2103
2104 /* Release data structure created for a kernel dispatch in SHADOW argument,
2105    and clean up the signal and memory allocations.  */
2106
2107 static void
2108 release_kernel_dispatch (struct kernel_dispatch *shadow)
2109 {
2110   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2111
2112   struct kernargs *kernargs = shadow->kernarg_address;
2113   void *addr = (void *)kernargs->abi.arena_ptr;
2114   if (!addr)
2115     addr = (void *)kernargs->abi.stack_ptr;
2116   release_ephemeral_memories (shadow->agent, addr);
2117
2118   hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2119
2120   hsa_signal_t s;
2121   s.handle = shadow->signal;
2122   hsa_fns.hsa_signal_destroy_fn (s);
2123
2124   free (shadow);
2125 }
2126
2127 /* Extract the properties from a kernel binary.  */
2128
2129 static void
2130 init_kernel_properties (struct kernel_info *kernel)
2131 {
2132   hsa_status_t status;
2133   struct agent_info *agent = kernel->agent;
2134   hsa_executable_symbol_t kernel_symbol;
2135   char *buf = alloca (strlen (kernel->name) + 4);
2136   sprintf (buf, "%s.kd", kernel->name);
2137   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2138                                                  buf, agent->id,
2139                                                  0, &kernel_symbol);
2140   if (status != HSA_STATUS_SUCCESS)
2141     {
2142       hsa_warn ("Could not find symbol for kernel in the code object", status);
2143       fprintf (stderr, "not found name: '%s'\n", buf);
2144       dump_executable_symbols (agent->executable);
2145       goto failure;
2146     }
2147   GCN_DEBUG ("Located kernel %s\n", kernel->name);
2148   status = hsa_fns.hsa_executable_symbol_get_info_fn
2149     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2150   if (status != HSA_STATUS_SUCCESS)
2151     hsa_fatal ("Could not extract a kernel object from its symbol", status);
2152   status = hsa_fns.hsa_executable_symbol_get_info_fn
2153     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2154      &kernel->kernarg_segment_size);
2155   if (status != HSA_STATUS_SUCCESS)
2156     hsa_fatal ("Could not get info about kernel argument size", status);
2157   status = hsa_fns.hsa_executable_symbol_get_info_fn
2158     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2159      &kernel->group_segment_size);
2160   if (status != HSA_STATUS_SUCCESS)
2161     hsa_fatal ("Could not get info about kernel group segment size", status);
2162   status = hsa_fns.hsa_executable_symbol_get_info_fn
2163     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2164      &kernel->private_segment_size);
2165   if (status != HSA_STATUS_SUCCESS)
2166     hsa_fatal ("Could not get info about kernel private segment size",
2167                status);
2168
2169   /* The kernel type is not known until something tries to launch it.  */
2170   kernel->kind = KIND_UNKNOWN;
2171
2172   GCN_DEBUG ("Kernel structure for %s fully initialized with "
2173              "following segment sizes: \n", kernel->name);
2174   GCN_DEBUG ("  group_segment_size: %u\n",
2175              (unsigned) kernel->group_segment_size);
2176   GCN_DEBUG ("  private_segment_size: %u\n",
2177              (unsigned) kernel->private_segment_size);
2178   GCN_DEBUG ("  kernarg_segment_size: %u\n",
2179              (unsigned) kernel->kernarg_segment_size);
2180   return;
2181
2182 failure:
2183   kernel->initialization_failed = true;
2184 }
2185
2186 /* Do all the work that is necessary before running KERNEL for the first time.
2187    The function assumes the program has been created, finalized and frozen by
2188    create_and_finalize_hsa_program.  */
2189
2190 static void
2191 init_kernel (struct kernel_info *kernel)
2192 {
2193   if (pthread_mutex_lock (&kernel->init_mutex))
2194     GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2195   if (kernel->initialized)
2196     {
2197       if (pthread_mutex_unlock (&kernel->init_mutex))
2198         GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2199                            "mutex");
2200
2201       return;
2202     }
2203
2204   init_kernel_properties (kernel);
2205
2206   if (!kernel->initialization_failed)
2207     {
2208       GCN_DEBUG ("\n");
2209
2210       kernel->initialized = true;
2211     }
2212   if (pthread_mutex_unlock (&kernel->init_mutex))
2213     GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2214                        "mutex");
2215 }
2216
2217 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2218    launch attributes from KLA.
2219
2220    MODULE_LOCKED indicates that the caller already holds the lock and
2221    run_kernel need not lock it again.
2222    If AQ is NULL then agent->sync_queue will be used.  */
2223
2224 static void
2225 run_kernel (struct kernel_info *kernel, void *vars,
2226             struct GOMP_kernel_launch_attributes *kla,
2227             struct goacc_asyncqueue *aq, bool module_locked)
2228 {
2229   struct agent_info *agent = kernel->agent;
2230   GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2231              kernel->description->vpgr_count);
2232
2233   /* Reduce the number of threads/workers if there are insufficient
2234      VGPRs available to run the kernels together.  */
2235   if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2236     {
2237       int max_vgprs = max_isa_vgprs (agent->device_isa);
2238       int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2239       int max_threads = (max_vgprs / granulated_vgprs) * 4;
2240       if (kla->gdims[2] > max_threads)
2241         {
2242           GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2243                        " per team/gang - reducing to %d threads/workers.\n",
2244                        kla->gdims[2], max_threads);
2245           kla->gdims[2] = max_threads;
2246         }
2247     }
2248
2249   GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2250              (aq ? aq->id : 0));
2251   GCN_DEBUG ("GCN launch attribs: gdims:[");
2252   int i;
2253   for (i = 0; i < kla->ndim; ++i)
2254     {
2255       if (i)
2256         DEBUG_PRINT (", ");
2257       DEBUG_PRINT ("%u", kla->gdims[i]);
2258     }
2259   DEBUG_PRINT ("], normalized gdims:[");
2260   for (i = 0; i < kla->ndim; ++i)
2261     {
2262       if (i)
2263         DEBUG_PRINT (", ");
2264       DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2265     }
2266   DEBUG_PRINT ("], wdims:[");
2267   for (i = 0; i < kla->ndim; ++i)
2268     {
2269       if (i)
2270         DEBUG_PRINT (", ");
2271       DEBUG_PRINT ("%u", kla->wdims[i]);
2272     }
2273   DEBUG_PRINT ("]\n");
2274   DEBUG_FLUSH ();
2275
2276   if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2277     GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2278
2279   if (!agent->initialized)
2280     GOMP_PLUGIN_fatal ("Agent must be initialized");
2281
2282   if (!kernel->initialized)
2283     GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2284
2285   hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2286
2287   uint64_t index
2288     = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2289   GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2290
2291   /* Wait until the queue is not full before writing the packet.   */
2292   while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2293          >= command_q->size)
2294     ;
2295
2296   /* Do not allow the dimensions to be overridden when running
2297      constructors or destructors.  */
2298   int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2299   int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2300
2301   hsa_kernel_dispatch_packet_t *packet;
2302   packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2303            + index % command_q->size;
2304
2305   memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2306   packet->grid_size_x = override_x ? : kla->gdims[0];
2307   packet->workgroup_size_x = get_group_size (kla->ndim,
2308                                              packet->grid_size_x,
2309                                              kla->wdims[0]);
2310
2311   if (kla->ndim >= 2)
2312     {
2313       packet->grid_size_y = kla->gdims[1];
2314       packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2315                                                  kla->wdims[1]);
2316     }
2317   else
2318     {
2319       packet->grid_size_y = 1;
2320       packet->workgroup_size_y = 1;
2321     }
2322
2323   if (kla->ndim == 3)
2324     {
2325       packet->grid_size_z = limit_worker_threads (override_z
2326                                                   ? : kla->gdims[2]);
2327       packet->workgroup_size_z = get_group_size (kla->ndim,
2328                                                  packet->grid_size_z,
2329                                                  kla->wdims[2]);
2330     }
2331   else
2332     {
2333       packet->grid_size_z = 1;
2334       packet->workgroup_size_z = 1;
2335     }
2336
2337   GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2338              " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2339              packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2340              packet->grid_size_x / packet->workgroup_size_x,
2341              packet->grid_size_y / packet->workgroup_size_y,
2342              packet->grid_size_z / packet->workgroup_size_z,
2343              packet->workgroup_size_x, packet->workgroup_size_y,
2344              packet->workgroup_size_z);
2345
2346   struct kernel_dispatch *shadow
2347     = create_kernel_dispatch (kernel, packet->grid_size_x,
2348                               packet->grid_size_z);
2349   shadow->queue = command_q;
2350
2351   if (debug)
2352     {
2353       fprintf (stderr, "\nKernel has following dependencies:\n");
2354       print_kernel_dispatch (shadow, 2);
2355     }
2356
2357   packet->private_segment_size = shadow->private_segment_size;
2358   packet->group_segment_size = shadow->group_segment_size;
2359   packet->kernel_object = shadow->object;
2360   packet->kernarg_address = shadow->kernarg_address;
2361   hsa_signal_t s;
2362   s.handle = shadow->signal;
2363   packet->completion_signal = s;
2364   hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2365   memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2366
2367   GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2368
2369   uint16_t header;
2370   header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2371   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2372   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2373
2374   GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2375              agent->device_id);
2376
2377   packet_store_release ((uint32_t *) packet, header,
2378                         (uint16_t) kla->ndim
2379                         << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2380
2381   hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2382                                        index);
2383
2384   GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2385
2386   /* Root signal waits with 1ms timeout.  */
2387   while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2388                                              1000 * 1000,
2389                                              HSA_WAIT_STATE_BLOCKED) != 0)
2390     {
2391       console_output (kernel, shadow->kernarg_address, false);
2392     }
2393   console_output (kernel, shadow->kernarg_address, true);
2394
2395   struct kernargs *kernargs = shadow->kernarg_address;
2396   unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2397
2398   release_kernel_dispatch (shadow);
2399
2400   if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2401     GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2402
2403   unsigned int upper = (return_value & ~0xffff) >> 16;
2404   if (upper == 0xcafe)
2405     ; // exit not called, normal termination.
2406   else if (upper == 0xffff)
2407     ; // exit called.
2408   else
2409     {
2410       GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2411                          " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2412                          return_value);
2413       abort ();
2414     }
2415
2416   if (upper == 0xffff)
2417     {
2418       unsigned int signal = (return_value >> 8) & 0xff;
2419
2420       if (signal == SIGABRT)
2421         {
2422           GCN_WARNING ("GCN Kernel aborted\n");
2423           abort ();
2424         }
2425       else if (signal != 0)
2426         {
2427           GCN_WARNING ("GCN Kernel received unknown signal\n");
2428           abort ();
2429         }
2430
2431       GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2432       exit (return_value & 0xff);
2433     }
2434 }
2435
2436 /* }}}  */
2437 /* {{{ Load/Unload  */
2438
2439 /* Initialize KERNEL from D and other parameters.  Return true on success. */
2440
2441 static bool
2442 init_basic_kernel_info (struct kernel_info *kernel,
2443                         struct hsa_kernel_description *d,
2444                         struct agent_info *agent,
2445                         struct module_info *module)
2446 {
2447   kernel->agent = agent;
2448   kernel->module = module;
2449   kernel->name = d->name;
2450   kernel->description = d;
2451   if (pthread_mutex_init (&kernel->init_mutex, NULL))
2452     {
2453       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2454       return false;
2455     }
2456   return true;
2457 }
2458
2459 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2460
2461 static bool
2462 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2463 {
2464   int isa_field = elf_gcn_isa_field (image);
2465   const char* isa_s = isa_hsa_name (isa_field);
2466   if (!isa_s)
2467     {
2468       hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2469       return false;
2470     }
2471
2472   if (isa_field != agent->device_isa)
2473     {
2474       char msg[120];
2475       const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2476       const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2477       assert (agent_isa_s);
2478       assert (agent_isa_gcc_s);
2479
2480       snprintf (msg, sizeof msg,
2481                 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2482                 "Try to recompile with '-foffload-options=-march=%s'.\n",
2483                 isa_s, agent_isa_s, agent_isa_gcc_s);
2484
2485       hsa_error (msg, HSA_STATUS_ERROR);
2486       return false;
2487     }
2488
2489   return true;
2490 }
2491
2492 /* Create and finalize the program consisting of all loaded modules.  */
2493
2494 static bool
2495 create_and_finalize_hsa_program (struct agent_info *agent)
2496 {
2497   hsa_status_t status;
2498   bool res = true;
2499   if (pthread_mutex_lock (&agent->prog_mutex))
2500     {
2501       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2502       return false;
2503     }
2504   if (agent->prog_finalized)
2505     goto final;
2506
2507   status
2508     = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2509                                         HSA_EXECUTABLE_STATE_UNFROZEN,
2510                                         "", &agent->executable);
2511   if (status != HSA_STATUS_SUCCESS)
2512     {
2513       hsa_error ("Could not create GCN executable", status);
2514       goto fail;
2515     }
2516
2517   /* Load any GCN modules.  */
2518   struct module_info *module = agent->module;
2519   if (module)
2520     {
2521       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2522
2523       if (!isa_matches_agent (agent, image))
2524         goto fail;
2525
2526       hsa_code_object_t co = { 0 };
2527       status = hsa_fns.hsa_code_object_deserialize_fn
2528         (module->image_desc->gcn_image->image,
2529          module->image_desc->gcn_image->size,
2530          NULL, &co);
2531       if (status != HSA_STATUS_SUCCESS)
2532         {
2533           hsa_error ("Could not deserialize GCN code object", status);
2534           goto fail;
2535         }
2536
2537       status = hsa_fns.hsa_executable_load_code_object_fn
2538         (agent->executable, agent->id, co, "");
2539       if (status != HSA_STATUS_SUCCESS)
2540         {
2541           hsa_error ("Could not load GCN code object", status);
2542           goto fail;
2543         }
2544
2545       if (!module->heap)
2546         {
2547           status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2548                                                    gcn_kernel_heap_size,
2549                                                    (void**)&module->heap);
2550           if (status != HSA_STATUS_SUCCESS)
2551             {
2552               hsa_error ("Could not allocate memory for GCN heap", status);
2553               goto fail;
2554             }
2555
2556           status = hsa_fns.hsa_memory_assign_agent_fn
2557                         (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2558           if (status != HSA_STATUS_SUCCESS)
2559             {
2560               hsa_error ("Could not assign GCN heap memory to device", status);
2561               goto fail;
2562             }
2563
2564           hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2565                                       &gcn_kernel_heap_size,
2566                                       sizeof (gcn_kernel_heap_size));
2567         }
2568
2569     }
2570
2571   if (debug)
2572     dump_executable_symbols (agent->executable);
2573
2574   status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2575   if (status != HSA_STATUS_SUCCESS)
2576     {
2577       hsa_error ("Could not freeze the GCN executable", status);
2578       goto fail;
2579     }
2580
2581 final:
2582   agent->prog_finalized = true;
2583
2584   if (pthread_mutex_unlock (&agent->prog_mutex))
2585     {
2586       GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2587       res = false;
2588     }
2589
2590   return res;
2591
2592 fail:
2593   res = false;
2594   goto final;
2595 }
2596
2597 /* Free the HSA program in agent and everything associated with it and set
2598    agent->prog_finalized and the initialized flags of all kernels to false.
2599    Return TRUE on success.  */
2600
2601 static bool
2602 destroy_hsa_program (struct agent_info *agent)
2603 {
2604   if (!agent->prog_finalized)
2605     return true;
2606
2607   hsa_status_t status;
2608
2609   GCN_DEBUG ("Destroying the current GCN program.\n");
2610
2611   status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2612   if (status != HSA_STATUS_SUCCESS)
2613     return hsa_error ("Could not destroy GCN executable", status);
2614
2615   if (agent->module)
2616     {
2617       int i;
2618       for (i = 0; i < agent->module->kernel_count; i++)
2619         agent->module->kernels[i].initialized = false;
2620
2621       if (agent->module->heap)
2622         {
2623           hsa_fns.hsa_memory_free_fn (agent->module->heap);
2624           agent->module->heap = NULL;
2625         }
2626     }
2627   agent->prog_finalized = false;
2628   return true;
2629 }
2630
2631 /* Deinitialize all information associated with MODULE and kernels within
2632    it.  Return TRUE on success.  */
2633
2634 static bool
2635 destroy_module (struct module_info *module, bool locked)
2636 {
2637   /* Run destructors before destroying module.  */
2638   struct GOMP_kernel_launch_attributes kla =
2639     { 3,
2640       /* Grid size.  */
2641       { 1, 64, 1 },
2642       /* Work-group size.  */
2643       { 1, 64, 1 }
2644     };
2645
2646   if (module->fini_array_func)
2647     {
2648       init_kernel (module->fini_array_func);
2649       run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2650     }
2651   module->constructors_run_p = false;
2652
2653   int i;
2654   for (i = 0; i < module->kernel_count; i++)
2655     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2656       {
2657         GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2658                            "mutex");
2659         return false;
2660       }
2661
2662   return true;
2663 }
2664
2665 /* }}}  */
2666 /* {{{ Async  */
2667
2668 /* Callback of dispatch queues to report errors.  */
2669
2670 static void
2671 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2672 {
2673   struct queue_entry *entry = &aq->queue[index];
2674
2675   switch (entry->type)
2676     {
2677     case KERNEL_LAUNCH:
2678       if (DEBUG_QUEUES)
2679         GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2680                    aq->agent->device_id, aq->id, index);
2681       run_kernel (entry->u.launch.kernel,
2682                   entry->u.launch.vars,
2683                   &entry->u.launch.kla, aq, false);
2684       if (DEBUG_QUEUES)
2685         GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2686                    aq->agent->device_id, aq->id, index);
2687       break;
2688
2689     case CALLBACK:
2690       if (DEBUG_QUEUES)
2691         GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2692                    aq->agent->device_id, aq->id, index);
2693       entry->u.callback.fn (entry->u.callback.data);
2694       if (DEBUG_QUEUES)
2695         GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2696                    aq->agent->device_id, aq->id, index);
2697       break;
2698
2699     case ASYNC_WAIT:
2700       {
2701         /* FIXME: is it safe to access a placeholder that may already have
2702            been executed?  */
2703         struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2704
2705         if (DEBUG_QUEUES)
2706           GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2707                      aq->agent->device_id, aq->id, index);
2708
2709         pthread_mutex_lock (&placeholderp->mutex);
2710
2711         while (!placeholderp->executed)
2712           pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2713
2714         pthread_mutex_unlock (&placeholderp->mutex);
2715
2716         if (pthread_cond_destroy (&placeholderp->cond))
2717           GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2718
2719         if (pthread_mutex_destroy (&placeholderp->mutex))
2720           GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2721
2722         if (DEBUG_QUEUES)
2723           GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2724                      "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2725       }
2726       break;
2727
2728     case ASYNC_PLACEHOLDER:
2729       pthread_mutex_lock (&entry->u.placeholder.mutex);
2730       entry->u.placeholder.executed = 1;
2731       pthread_cond_signal (&entry->u.placeholder.cond);
2732       pthread_mutex_unlock (&entry->u.placeholder.mutex);
2733       break;
2734
2735     default:
2736       GOMP_PLUGIN_fatal ("Unknown queue element");
2737     }
2738 }
2739
2740 /* This function is run as a thread to service an async queue in the
2741    background.  It runs continuously until the stop flag is set.  */
2742
2743 static void *
2744 drain_queue (void *thread_arg)
2745 {
2746   struct goacc_asyncqueue *aq = thread_arg;
2747
2748   if (DRAIN_QUEUE_SYNCHRONOUS_P)
2749     {
2750       aq->drain_queue_stop = 2;
2751       return NULL;
2752     }
2753
2754   pthread_mutex_lock (&aq->mutex);
2755
2756   while (true)
2757     {
2758       if (aq->drain_queue_stop)
2759         break;
2760
2761       if (aq->queue_n > 0)
2762         {
2763           pthread_mutex_unlock (&aq->mutex);
2764           execute_queue_entry (aq, aq->queue_first);
2765
2766           pthread_mutex_lock (&aq->mutex);
2767           aq->queue_first = ((aq->queue_first + 1)
2768                              % ASYNC_QUEUE_SIZE);
2769           aq->queue_n--;
2770
2771           if (DEBUG_THREAD_SIGNAL)
2772             GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2773                        aq->agent->device_id, aq->id);
2774           pthread_cond_broadcast (&aq->queue_cond_out);
2775           pthread_mutex_unlock (&aq->mutex);
2776
2777           if (DEBUG_QUEUES)
2778             GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2779                        aq->id);
2780           pthread_mutex_lock (&aq->mutex);
2781         }
2782       else
2783         {
2784           if (DEBUG_THREAD_SLEEP)
2785             GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2786                        aq->agent->device_id, aq->id);
2787           pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2788           if (DEBUG_THREAD_SLEEP)
2789             GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2790                        aq->agent->device_id, aq->id);
2791         }
2792     }
2793
2794   aq->drain_queue_stop = 2;
2795   if (DEBUG_THREAD_SIGNAL)
2796     GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2797                aq->agent->device_id, aq->id);
2798   pthread_cond_broadcast (&aq->queue_cond_out);
2799   pthread_mutex_unlock (&aq->mutex);
2800
2801   GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2802   return NULL;
2803 }
2804
2805 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2806    is not usually the case.  This is just a debug tool.  */
2807
2808 static void
2809 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2810 {
2811   pthread_mutex_lock (&aq->mutex);
2812
2813   while (aq->queue_n > 0)
2814     {
2815       execute_queue_entry (aq, aq->queue_first);
2816
2817       aq->queue_first = ((aq->queue_first + 1)
2818                          % ASYNC_QUEUE_SIZE);
2819       aq->queue_n--;
2820     }
2821
2822   pthread_mutex_unlock (&aq->mutex);
2823 }
2824
2825 /* Block the current thread until an async queue is writable.  The aq->mutex
2826    lock should be held on entry, and remains locked on exit.  */
2827
2828 static void
2829 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2830 {
2831   if (aq->queue_n == ASYNC_QUEUE_SIZE)
2832     {
2833       /* Queue is full.  Wait for it to not be full.  */
2834       while (aq->queue_n == ASYNC_QUEUE_SIZE)
2835         pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2836     }
2837 }
2838
2839 /* Request an asynchronous kernel launch on the specified queue.  This
2840    may block if the queue is full, but returns without waiting for the
2841    kernel to run.  */
2842
2843 static void
2844 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2845                    void *vars, struct GOMP_kernel_launch_attributes *kla)
2846 {
2847   assert (aq->agent == kernel->agent);
2848
2849   pthread_mutex_lock (&aq->mutex);
2850
2851   wait_for_queue_nonfull (aq);
2852
2853   int queue_last = ((aq->queue_first + aq->queue_n)
2854                     % ASYNC_QUEUE_SIZE);
2855   if (DEBUG_QUEUES)
2856     GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2857                aq->id, queue_last);
2858
2859   aq->queue[queue_last].type = KERNEL_LAUNCH;
2860   aq->queue[queue_last].u.launch.kernel = kernel;
2861   aq->queue[queue_last].u.launch.vars = vars;
2862   aq->queue[queue_last].u.launch.kla = *kla;
2863
2864   aq->queue_n++;
2865
2866   if (DEBUG_THREAD_SIGNAL)
2867     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2868                aq->agent->device_id, aq->id);
2869   pthread_cond_signal (&aq->queue_cond_in);
2870
2871   pthread_mutex_unlock (&aq->mutex);
2872 }
2873
2874 /* Request an asynchronous callback on the specified queue.  The callback
2875    function will be called, with the given opaque data, from the appropriate
2876    async thread, when all previous items on that queue are complete.  */
2877
2878 static void
2879 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2880                      void *data)
2881 {
2882   pthread_mutex_lock (&aq->mutex);
2883
2884   wait_for_queue_nonfull (aq);
2885
2886   int queue_last = ((aq->queue_first + aq->queue_n)
2887                     % ASYNC_QUEUE_SIZE);
2888   if (DEBUG_QUEUES)
2889     GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2890                aq->id, queue_last);
2891
2892   aq->queue[queue_last].type = CALLBACK;
2893   aq->queue[queue_last].u.callback.fn = fn;
2894   aq->queue[queue_last].u.callback.data = data;
2895
2896   aq->queue_n++;
2897
2898   if (DEBUG_THREAD_SIGNAL)
2899     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2900                aq->agent->device_id, aq->id);
2901   pthread_cond_signal (&aq->queue_cond_in);
2902
2903   pthread_mutex_unlock (&aq->mutex);
2904 }
2905
2906 /* Request that a given async thread wait for another thread (unspecified) to
2907    reach the given placeholder.  The wait will occur when all previous entries
2908    on the queue are complete.  A placeholder is effectively a kind of signal
2909    which simply sets a flag when encountered in a queue.  */
2910
2911 static void
2912 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2913                       struct placeholder *placeholderp)
2914 {
2915   pthread_mutex_lock (&aq->mutex);
2916
2917   wait_for_queue_nonfull (aq);
2918
2919   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2920   if (DEBUG_QUEUES)
2921     GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2922                aq->id, queue_last);
2923
2924   aq->queue[queue_last].type = ASYNC_WAIT;
2925   aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2926
2927   aq->queue_n++;
2928
2929   if (DEBUG_THREAD_SIGNAL)
2930     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2931                aq->agent->device_id, aq->id);
2932   pthread_cond_signal (&aq->queue_cond_in);
2933
2934   pthread_mutex_unlock (&aq->mutex);
2935 }
2936
2937 /* Add a placeholder into an async queue.  When the async thread reaches the
2938    placeholder it will set the "executed" flag to true and continue.
2939    Another thread may be waiting on this thread reaching the placeholder.  */
2940
2941 static struct placeholder *
2942 queue_push_placeholder (struct goacc_asyncqueue *aq)
2943 {
2944   struct placeholder *placeholderp;
2945
2946   pthread_mutex_lock (&aq->mutex);
2947
2948   wait_for_queue_nonfull (aq);
2949
2950   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2951   if (DEBUG_QUEUES)
2952     GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2953                aq->id, queue_last);
2954
2955   aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2956   placeholderp = &aq->queue[queue_last].u.placeholder;
2957
2958   if (pthread_mutex_init (&placeholderp->mutex, NULL))
2959     {
2960       pthread_mutex_unlock (&aq->mutex);
2961       GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2962     }
2963
2964   if (pthread_cond_init (&placeholderp->cond, NULL))
2965     {
2966       pthread_mutex_unlock (&aq->mutex);
2967       GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2968     }
2969
2970   placeholderp->executed = 0;
2971
2972   aq->queue_n++;
2973
2974   if (DEBUG_THREAD_SIGNAL)
2975     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2976                aq->agent->device_id, aq->id);
2977   pthread_cond_signal (&aq->queue_cond_in);
2978
2979   pthread_mutex_unlock (&aq->mutex);
2980
2981   return placeholderp;
2982 }
2983
2984 /* Signal an asynchronous thread to terminate, and wait for it to do so.  */
2985
2986 static void
2987 finalize_async_thread (struct goacc_asyncqueue *aq)
2988 {
2989   pthread_mutex_lock (&aq->mutex);
2990   if (aq->drain_queue_stop == 2)
2991     {
2992       pthread_mutex_unlock (&aq->mutex);
2993       return;
2994     }
2995
2996   aq->drain_queue_stop = 1;
2997
2998   if (DEBUG_THREAD_SIGNAL)
2999     GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3000                aq->agent->device_id, aq->id);
3001   pthread_cond_signal (&aq->queue_cond_in);
3002
3003   while (aq->drain_queue_stop != 2)
3004     {
3005       if (DEBUG_THREAD_SLEEP)
3006         GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3007                    " to sleep\n", aq->agent->device_id, aq->id);
3008       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3009       if (DEBUG_THREAD_SLEEP)
3010         GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
3011                    aq->agent->device_id, aq->id);
3012     }
3013
3014   GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
3015              aq->id);
3016   pthread_mutex_unlock (&aq->mutex);
3017
3018   int err = pthread_join (aq->thread_drain_queue, NULL);
3019   if (err != 0)
3020     GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3021                        aq->agent->device_id, aq->id, strerror (err));
3022   GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
3023 }
3024
3025 /* Set up an async queue for OpenMP.  There will be only one.  The
3026    implementation simply uses an OpenACC async queue.
3027    FIXME: is this thread-safe if two threads call this function?  */
3028
3029 static void
3030 maybe_init_omp_async (struct agent_info *agent)
3031 {
3032   if (!agent->omp_async_queue)
3033     agent->omp_async_queue
3034       = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
3035 }
3036
3037 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3038    copies from read-only pages.  */
3039
3040 static void
3041 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3042 {
3043   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3044
3045   if (status == HSA_STATUS_SUCCESS)
3046     return;
3047
3048   /* It appears that the copy fails if the source data is in a read-only page.
3049      We can't detect that easily, so try copying the data to a temporary buffer
3050      and doing the copy again if we got an error above.  */
3051
3052   GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3053                "[%p:+%d]\n", (void *) src, (int) len);
3054
3055   void *src_copy = malloc (len);
3056   memcpy (src_copy, src, len);
3057   status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3058   free (src_copy);
3059   if (status != HSA_STATUS_SUCCESS)
3060     GOMP_PLUGIN_error ("memory copy failed");
3061 }
3062
3063 /* Copy data to or from a device.  This is intended for use as an async
3064    callback event.  */
3065
3066 static void
3067 copy_data (void *data_)
3068 {
3069   struct copy_data *data = (struct copy_data *)data_;
3070   GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3071              data->aq->agent->device_id, data->aq->id, data->len, data->src,
3072              data->dst);
3073   hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3074   free (data);
3075 }
3076
3077 /* Request an asynchronous data copy, to or from a device, on a given queue.
3078    The event will be registered as a callback.  */
3079
3080 static void
3081 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3082                  size_t len)
3083 {
3084   if (DEBUG_QUEUES)
3085     GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3086                aq->agent->device_id, aq->id, len, src, dst);
3087   struct copy_data *data
3088     = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3089   data->dst = dst;
3090   data->src = src;
3091   data->len = len;
3092   data->aq = aq;
3093   queue_push_callback (aq, copy_data, data);
3094 }
3095
3096 /* Return true if the given queue is currently empty.  */
3097
3098 static int
3099 queue_empty (struct goacc_asyncqueue *aq)
3100 {
3101   pthread_mutex_lock (&aq->mutex);
3102   int res = aq->queue_n == 0 ? 1 : 0;
3103   pthread_mutex_unlock (&aq->mutex);
3104
3105   return res;
3106 }
3107
3108 /* Wait for a given queue to become empty.  This implements an OpenACC wait
3109    directive.  */
3110
3111 static void
3112 wait_queue (struct goacc_asyncqueue *aq)
3113 {
3114   if (DRAIN_QUEUE_SYNCHRONOUS_P)
3115     {
3116       drain_queue_synchronous (aq);
3117       return;
3118     }
3119
3120   pthread_mutex_lock (&aq->mutex);
3121
3122   while (aq->queue_n > 0)
3123     {
3124       if (DEBUG_THREAD_SLEEP)
3125         GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3126                    aq->agent->device_id, aq->id);
3127       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3128       if (DEBUG_THREAD_SLEEP)
3129         GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
3130                    aq->id);
3131     }
3132
3133   pthread_mutex_unlock (&aq->mutex);
3134   GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3135 }
3136
3137 /* }}}  */
3138 /* {{{ OpenACC support  */
3139
3140 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
3141
3142 static void
3143 gcn_exec (struct kernel_info *kernel,
3144           void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3145           struct goacc_asyncqueue *aq)
3146 {
3147   if (!GOMP_OFFLOAD_can_run (kernel))
3148     GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3149
3150   /* If we get here then this must be an OpenACC kernel.  */
3151   kernel->kind = KIND_OPENACC;
3152
3153   struct hsa_kernel_description *hsa_kernel_desc = NULL;
3154   for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3155     {
3156       struct hsa_kernel_description *d
3157         = &kernel->module->image_desc->kernel_infos[i];
3158       if (d->name == kernel->name)
3159         {
3160           hsa_kernel_desc = d;
3161           break;
3162         }
3163     }
3164
3165   /* We may have statically-determined dimensions in
3166      hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3167      invocation at runtime in dims[].  We allow static dimensions to take
3168      priority over dynamic dimensions when present (non-zero).  */
3169   if (hsa_kernel_desc->oacc_dims[0] > 0)
3170     dims[0] = hsa_kernel_desc->oacc_dims[0];
3171   if (hsa_kernel_desc->oacc_dims[1] > 0)
3172     dims[1] = hsa_kernel_desc->oacc_dims[1];
3173   if (hsa_kernel_desc->oacc_dims[2] > 0)
3174     dims[2] = hsa_kernel_desc->oacc_dims[2];
3175
3176   /* Ideally, when a dimension isn't explicitly specified, we should
3177      tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3178      In practice, we tune for peak performance on BabelStream, which
3179      for OpenACC is currently 32 threads per CU.  */
3180   if (dims[0] == 0 && dims[1] == 0)
3181     {
3182       /* If any of the OpenACC dimensions remain 0 then we get to pick a
3183          number.  There isn't really a correct answer for this without a clue
3184          about the problem size, so let's do a reasonable number of workers
3185          and gangs.  */
3186
3187       dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
3188       dims[1] = 8; /* Workers.  */
3189     }
3190   else if (dims[0] == 0 && dims[1] > 0)
3191     {
3192       /* Auto-scale the number of gangs with the requested number of workers.  */
3193       dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3194     }
3195   else if (dims[0] > 0 && dims[1] == 0)
3196     {
3197       /* Auto-scale the number of workers with the requested number of gangs.  */
3198       dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3199       if (dims[1] == 0)
3200         dims[1] = 1;
3201       if (dims[1] > 16)
3202         dims[1] = 16;
3203     }
3204
3205   /* The incoming dimensions are expressed in terms of gangs, workers, and
3206      vectors.  The HSA dimensions are expressed in terms of "work-items",
3207      which means multiples of vector lanes.
3208
3209      The "grid size" specifies the size of the problem space, and the
3210      "work-group size" specifies how much of that we want a single compute
3211      unit to chew on at once.
3212
3213      The three dimensions do not really correspond to hardware, but the
3214      important thing is that the HSA runtime will launch as many
3215      work-groups as it takes to process the entire grid, and each
3216      work-group will contain as many wave-fronts as it takes to process
3217      the work-items in that group.
3218
3219      Essentially, as long as we set the Y dimension to 64 (the number of
3220      vector lanes in hardware), and the Z group size to the maximum (16),
3221      then we will get the gangs (X) and workers (Z) launched as we expect.
3222
3223      The reason for the apparent reversal of vector and worker dimension
3224      order is to do with the way the run-time distributes work-items across
3225      v1 and v2.  */
3226   struct GOMP_kernel_launch_attributes kla =
3227     {3,
3228      /* Grid size.  */
3229      {dims[0], 64, dims[1]},
3230      /* Work-group size.  */
3231      {1,       64, 16}
3232     };
3233
3234   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3235   acc_prof_info *prof_info = thr->prof_info;
3236   acc_event_info enqueue_launch_event_info;
3237   acc_api_info *api_info = thr->api_info;
3238   bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3239   if (profiling_dispatch_p)
3240     {
3241       prof_info->event_type = acc_ev_enqueue_launch_start;
3242
3243       enqueue_launch_event_info.launch_event.event_type
3244         = prof_info->event_type;
3245       enqueue_launch_event_info.launch_event.valid_bytes
3246         = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3247       enqueue_launch_event_info.launch_event.parent_construct
3248         = acc_construct_parallel;
3249       enqueue_launch_event_info.launch_event.implicit = 1;
3250       enqueue_launch_event_info.launch_event.tool_info = NULL;
3251       enqueue_launch_event_info.launch_event.kernel_name
3252         = (char *) kernel->name;
3253       enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3254       enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3255       enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3256
3257       api_info->device_api = acc_device_api_other;
3258
3259       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3260         &enqueue_launch_event_info, api_info);
3261     }
3262
3263   if (!async)
3264     run_kernel (kernel, devaddrs, &kla, NULL, false);
3265   else
3266     queue_push_launch (aq, kernel, devaddrs, &kla);
3267
3268   if (profiling_dispatch_p)
3269     {
3270       prof_info->event_type = acc_ev_enqueue_launch_end;
3271       enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3272       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3273                                             &enqueue_launch_event_info,
3274                                             api_info);
3275     }
3276 }
3277
3278 /* }}}  */
3279 /* {{{ Generic Plugin API  */
3280
3281 /* Return the name of the accelerator, which is "gcn".  */
3282
3283 const char *
3284 GOMP_OFFLOAD_get_name (void)
3285 {
3286   return "gcn";
3287 }
3288
3289 /* Return the specific capabilities the HSA accelerator have.  */
3290
3291 unsigned int
3292 GOMP_OFFLOAD_get_caps (void)
3293 {
3294   /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
3295   return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3296             | GOMP_OFFLOAD_CAP_OPENACC_200;
3297 }
3298
3299 /* Identify as GCN accelerator.  */
3300
3301 int
3302 GOMP_OFFLOAD_get_type (void)
3303 {
3304   return OFFLOAD_TARGET_TYPE_GCN;
3305 }
3306
3307 /* Return the libgomp version number we're compatible with.  There is
3308    no requirement for cross-version compatibility.  */
3309
3310 unsigned
3311 GOMP_OFFLOAD_version (void)
3312 {
3313   return GOMP_VERSION;
3314 }
3315
3316 /* Return the number of GCN devices on the system.  */
3317
3318 int
3319 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3320 {
3321   if (!init_hsa_context ())
3322     return 0;
3323   /* Return -1 if no omp_requires_mask cannot be fulfilled but
3324      devices were present.  */
3325   if (hsa_context.agent_count > 0
3326       && ((omp_requires_mask
3327            & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3328                | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3329     return -1;
3330   return hsa_context.agent_count;
3331 }
3332
3333 /* Initialize device (agent) number N so that it can be used for computation.
3334    Return TRUE on success.  */
3335
3336 bool
3337 GOMP_OFFLOAD_init_device (int n)
3338 {
3339   if (!init_hsa_context ())
3340     return false;
3341   if (n >= hsa_context.agent_count)
3342     {
3343       GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3344       return false;
3345     }
3346   struct agent_info *agent = &hsa_context.agents[n];
3347
3348   if (agent->initialized)
3349     return true;
3350
3351   agent->device_id = n;
3352
3353   if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3354     {
3355       GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3356       return false;
3357     }
3358   if (pthread_mutex_init (&agent->prog_mutex, NULL))
3359     {
3360       GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3361       return false;
3362     }
3363   if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3364     {
3365       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3366       return false;
3367     }
3368   if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
3369     {
3370       GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3371       return false;
3372     }
3373   agent->async_queues = NULL;
3374   agent->omp_async_queue = NULL;
3375   agent->ephemeral_memories_list = NULL;
3376
3377   uint32_t queue_size;
3378   hsa_status_t status;
3379   status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3380                                           HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3381                                           &queue_size);
3382   if (status != HSA_STATUS_SUCCESS)
3383     return hsa_error ("Error requesting maximum queue size of the GCN agent",
3384                       status);
3385
3386   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3387                                           &agent->name);
3388   if (status != HSA_STATUS_SUCCESS)
3389     return hsa_error ("Error querying the name of the agent", status);
3390
3391   agent->device_isa = isa_code (agent->name);
3392   if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
3393     return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3394
3395   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3396                                           &agent->vendor_name);
3397   if (status != HSA_STATUS_SUCCESS)
3398     return hsa_error ("Error querying the vendor name of the agent", status);
3399
3400   status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3401                                         HSA_QUEUE_TYPE_MULTI,
3402                                         hsa_queue_callback, NULL, UINT32_MAX,
3403                                         UINT32_MAX, &agent->sync_queue);
3404   if (status != HSA_STATUS_SUCCESS)
3405     return hsa_error ("Error creating command queue", status);
3406
3407   agent->kernarg_region.handle = (uint64_t) -1;
3408   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3409                                                  get_kernarg_memory_region,
3410                                                  &agent->kernarg_region);
3411   if (status != HSA_STATUS_SUCCESS
3412       && status != HSA_STATUS_INFO_BREAK)
3413     hsa_error ("Scanning memory regions failed", status);
3414   if (agent->kernarg_region.handle == (uint64_t) -1)
3415     {
3416       GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3417                          "arguments");
3418       return false;
3419     }
3420   GCN_DEBUG ("Selected kernel arguments memory region:\n");
3421   dump_hsa_region (agent->kernarg_region, NULL);
3422
3423   agent->data_region.handle = (uint64_t) -1;
3424   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3425                                                  get_data_memory_region,
3426                                                  &agent->data_region);
3427   if (status != HSA_STATUS_SUCCESS
3428       && status != HSA_STATUS_INFO_BREAK)
3429     hsa_error ("Scanning memory regions failed", status);
3430   if (agent->data_region.handle == (uint64_t) -1)
3431     {
3432       GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3433                          "data");
3434       return false;
3435     }
3436   GCN_DEBUG ("Selected device data memory region:\n");
3437   dump_hsa_region (agent->data_region, NULL);
3438
3439   GCN_DEBUG ("GCN agent %d initialized\n", n);
3440
3441   agent->initialized = true;
3442   return true;
3443 }
3444
3445 /* Load GCN object-code module described by struct gcn_image_desc in
3446    TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3447    If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
3448    contain the on-device addresses of the functions for reverse offload.  To be
3449    freed by the caller.  */
3450
3451 int
3452 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3453                          struct addr_pair **target_table,
3454                          uint64_t **rev_fn_table,
3455                          uint64_t *host_ind_fn_table)
3456 {
3457   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3458     {
3459       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3460                          " (expected %u, received %u)",
3461                          GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3462       return -1;
3463     }
3464
3465   struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3466   struct agent_info *agent;
3467   struct addr_pair *pair;
3468   struct module_info *module;
3469   struct kernel_info *kernel;
3470   int kernel_count = image_desc->kernel_count;
3471   unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3472                               ? image_desc->ind_func_count : 0;
3473   unsigned var_count = image_desc->global_variable_count;
3474   /* Currently, "others" is a struct of ICVS.  */
3475   int other_count = 1;
3476
3477   agent = get_agent_info (ord);
3478   if (!agent)
3479     return -1;
3480
3481   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3482     {
3483       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3484       return -1;
3485     }
3486   if (agent->prog_finalized
3487       && !destroy_hsa_program (agent))
3488     return -1;
3489
3490   GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3491   GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
3492   GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3493   GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3494   pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3495                              * sizeof (struct addr_pair));
3496   *target_table = pair;
3497   module = (struct module_info *)
3498     GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3499                                 + kernel_count * sizeof (struct kernel_info));
3500   module->image_desc = image_desc;
3501   module->kernel_count = kernel_count;
3502   module->heap = NULL;
3503   module->constructors_run_p = false;
3504
3505   kernel = &module->kernels[0];
3506
3507   /* Allocate memory for kernel dependencies.  */
3508   for (unsigned i = 0; i < kernel_count; i++)
3509     {
3510       struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3511       if (!init_basic_kernel_info (kernel, d, agent, module))
3512         return -1;
3513       if (strcmp (d->name, "_init_array") == 0)
3514         module->init_array_func = kernel;
3515       else if (strcmp (d->name, "_fini_array") == 0)
3516         module->fini_array_func = kernel;
3517       else
3518         {
3519           pair->start = (uintptr_t) kernel;
3520           pair->end = (uintptr_t) (kernel + 1);
3521           pair++;
3522         }
3523       kernel++;
3524     }
3525
3526   agent->module = module;
3527   if (pthread_rwlock_unlock (&agent->module_rwlock))
3528     {
3529       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3530       return -1;
3531     }
3532
3533   if (!create_and_finalize_hsa_program (agent))
3534     return -1;
3535
3536   if (var_count > 0)
3537     {
3538       hsa_status_t status;
3539       hsa_executable_symbol_t var_symbol;
3540       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3541                                                      ".offload_var_table",
3542                                                      agent->id,
3543                                                      0, &var_symbol);
3544
3545       if (status != HSA_STATUS_SUCCESS)
3546         hsa_fatal ("Could not find symbol for variable in the code object",
3547                    status);
3548
3549       uint64_t var_table_addr;
3550       status = hsa_fns.hsa_executable_symbol_get_info_fn
3551         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3552          &var_table_addr);
3553       if (status != HSA_STATUS_SUCCESS)
3554         hsa_fatal ("Could not extract a variable from its symbol", status);
3555
3556       struct {
3557         uint64_t addr;
3558         uint64_t size;
3559       } var_table[var_count];
3560       GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3561                              (void*)var_table_addr, sizeof (var_table));
3562
3563       for (unsigned i = 0; i < var_count; i++)
3564         {
3565           pair->start = var_table[i].addr;
3566           pair->end = var_table[i].addr + var_table[i].size;
3567           GCN_DEBUG ("Found variable at %p with size %lu\n",
3568                      (void *)var_table[i].addr, var_table[i].size);
3569           pair++;
3570         }
3571     }
3572
3573   if (ind_func_count > 0)
3574     {
3575       hsa_status_t status;
3576
3577       /* Read indirect function table from image.  */
3578       hsa_executable_symbol_t ind_funcs_symbol;
3579       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3580                                                      ".offload_ind_func_table",
3581                                                      agent->id,
3582                                                      0, &ind_funcs_symbol);
3583
3584       if (status != HSA_STATUS_SUCCESS)
3585         hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3586                    "code object", status);
3587
3588       uint64_t ind_funcs_table_addr;
3589       status = hsa_fns.hsa_executable_symbol_get_info_fn
3590         (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3591          &ind_funcs_table_addr);
3592       if (status != HSA_STATUS_SUCCESS)
3593         hsa_fatal ("Could not extract a variable from its symbol", status);
3594
3595       uint64_t ind_funcs_table[ind_func_count];
3596       GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
3597                              (void*) ind_funcs_table_addr,
3598                              sizeof (ind_funcs_table));
3599
3600       /* Build host->target address map for indirect functions.  */
3601       uint64_t ind_fn_map[ind_func_count * 2 + 1];
3602       for (unsigned i = 0; i < ind_func_count; i++)
3603         {
3604           ind_fn_map[i * 2] = host_ind_fn_table[i];
3605           ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
3606           GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3607                      i, host_ind_fn_table[i], ind_funcs_table[i]);
3608         }
3609       ind_fn_map[ind_func_count * 2] = 0;
3610
3611       /* Write the map onto the target.  */
3612       void *map_target_addr
3613         = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
3614       GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
3615
3616       GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
3617                              (void*) ind_fn_map,
3618                              sizeof (ind_fn_map));
3619
3620       /* Write address of the map onto the target.  */
3621       hsa_executable_symbol_t symbol;
3622
3623       status
3624         = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3625                                                 XSTRING (GOMP_INDIRECT_ADDR_MAP),
3626                                                 agent->id, 0, &symbol);
3627       if (status != HSA_STATUS_SUCCESS)
3628         hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3629                    status);
3630
3631       uint64_t varptr;
3632       uint32_t varsize;
3633
3634       status = hsa_fns.hsa_executable_symbol_get_info_fn
3635         (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3636          &varptr);
3637       if (status != HSA_STATUS_SUCCESS)
3638         hsa_fatal ("Could not extract a variable from its symbol", status);
3639       status = hsa_fns.hsa_executable_symbol_get_info_fn
3640         (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3641         &varsize);
3642       if (status != HSA_STATUS_SUCCESS)
3643         hsa_fatal ("Could not extract a variable size from its symbol",
3644                    status);
3645
3646       GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3647                  varptr, varsize);
3648
3649       GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
3650                              &map_target_addr,
3651                              sizeof (map_target_addr));
3652     }
3653
3654   GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3655
3656   hsa_status_t status;
3657   hsa_executable_symbol_t var_symbol;
3658   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3659                                                  XSTRING (GOMP_ADDITIONAL_ICVS),
3660                                                  agent->id, 0, &var_symbol);
3661   if (status == HSA_STATUS_SUCCESS)
3662     {
3663       uint64_t varptr;
3664       uint32_t varsize;
3665
3666       status = hsa_fns.hsa_executable_symbol_get_info_fn
3667         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3668          &varptr);
3669       if (status != HSA_STATUS_SUCCESS)
3670         hsa_fatal ("Could not extract a variable from its symbol", status);
3671       status = hsa_fns.hsa_executable_symbol_get_info_fn
3672         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3673          &varsize);
3674       if (status != HSA_STATUS_SUCCESS)
3675         hsa_fatal ("Could not extract a variable size from its symbol",
3676                    status);
3677
3678       pair->start = varptr;
3679       pair->end = varptr + varsize;
3680     }
3681   else
3682     {
3683       /* The variable was not in this image.  */
3684       GCN_DEBUG ("Variable not found in image: %s\n",
3685                  XSTRING (GOMP_ADDITIONAL_ICVS));
3686       pair->start = pair->end = 0;
3687     }
3688
3689   /* Ensure that constructors are run first.  */
3690   struct GOMP_kernel_launch_attributes kla =
3691     { 3,
3692       /* Grid size.  */
3693       { 1, 64, 1 },
3694       /* Work-group size.  */
3695       { 1, 64, 1 }
3696     };
3697
3698   if (module->init_array_func)
3699     {
3700       init_kernel (module->init_array_func);
3701       run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3702     }
3703   module->constructors_run_p = true;
3704
3705   /* Don't report kernels that libgomp need not know about.  */
3706   if (module->init_array_func)
3707     kernel_count--;
3708   if (module->fini_array_func)
3709     kernel_count--;
3710
3711   if (rev_fn_table != NULL && kernel_count == 0)
3712     *rev_fn_table = NULL;
3713   else if (rev_fn_table != NULL)
3714     {
3715       hsa_status_t status;
3716       hsa_executable_symbol_t var_symbol;
3717       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3718                                                      ".offload_func_table",
3719                                                      agent->id, 0, &var_symbol);
3720       if (status != HSA_STATUS_SUCCESS)
3721         hsa_fatal ("Could not find symbol for variable in the code object",
3722                    status);
3723       uint64_t fn_table_addr;
3724       status = hsa_fns.hsa_executable_symbol_get_info_fn
3725         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3726          &fn_table_addr);
3727       if (status != HSA_STATUS_SUCCESS)
3728         hsa_fatal ("Could not extract a variable from its symbol", status);
3729       *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3730       GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3731                              (void*) fn_table_addr,
3732                              kernel_count * sizeof (uint64_t));
3733     }
3734
3735   return kernel_count + var_count + other_count;
3736 }
3737
3738 /* Unload GCN object-code module described by struct gcn_image_desc in
3739    TARGET_DATA from agent number N.  Return TRUE on success.  */
3740
3741 bool
3742 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3743 {
3744   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3745     {
3746       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3747                          " (expected %u, received %u)",
3748                          GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3749       return false;
3750     }
3751
3752   struct agent_info *agent;
3753   agent = get_agent_info (n);
3754   if (!agent)
3755     return false;
3756
3757   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3758     {
3759       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3760       return false;
3761     }
3762
3763   if (!agent->module || agent->module->image_desc != target_data)
3764     {
3765       GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3766                          "loaded before");
3767       return false;
3768     }
3769
3770   if (!destroy_module (agent->module, true))
3771     return false;
3772   free (agent->module);
3773   agent->module = NULL;
3774   if (!destroy_hsa_program (agent))
3775     return false;
3776   if (pthread_rwlock_unlock (&agent->module_rwlock))
3777     {
3778       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3779       return false;
3780     }
3781   return true;
3782 }
3783
3784 /* Deinitialize all information and status associated with agent number N.  We
3785    do not attempt any synchronization, assuming the user and libgomp will not
3786    attempt deinitialization of a device that is in any way being used at the
3787    same time.  Return TRUE on success.  */
3788
3789 bool
3790 GOMP_OFFLOAD_fini_device (int n)
3791 {
3792   struct agent_info *agent = get_agent_info (n);
3793   if (!agent)
3794     return false;
3795
3796   if (!agent->initialized)
3797     return true;
3798
3799   if (agent->omp_async_queue)
3800     {
3801       GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3802       agent->omp_async_queue = NULL;
3803     }
3804
3805   if (agent->module)
3806     {
3807       if (!destroy_module (agent->module, false))
3808         return false;
3809       free (agent->module);
3810       agent->module = NULL;
3811     }
3812
3813   if (!destroy_ephemeral_memories (agent))
3814     return false;
3815
3816   if (!destroy_hsa_program (agent))
3817     return false;
3818
3819   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3820   if (status != HSA_STATUS_SUCCESS)
3821     return hsa_error ("Error destroying command queue", status);
3822
3823   if (pthread_mutex_destroy (&agent->prog_mutex))
3824     {
3825       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3826       return false;
3827     }
3828   if (pthread_rwlock_destroy (&agent->module_rwlock))
3829     {
3830       GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3831       return false;
3832     }
3833
3834   if (pthread_mutex_destroy (&agent->async_queues_mutex))
3835     {
3836       GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3837       return false;
3838     }
3839   if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
3840     {
3841       GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3842       return false;
3843     }
3844   agent->initialized = false;
3845   return true;
3846 }
3847
3848 /* Return true if the HSA runtime can run function FN_PTR.  */
3849
3850 bool
3851 GOMP_OFFLOAD_can_run (void *fn_ptr)
3852 {
3853   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3854
3855   init_kernel (kernel);
3856   if (kernel->initialization_failed)
3857     goto failure;
3858
3859   return true;
3860
3861 failure:
3862   if (suppress_host_fallback)
3863     GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3864   GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3865   return false;
3866 }
3867
3868 /* Allocate memory on device N.  */
3869
3870 void *
3871 GOMP_OFFLOAD_alloc (int n, size_t size)
3872 {
3873   struct agent_info *agent = get_agent_info (n);
3874   return alloc_by_agent (agent, size);
3875 }
3876
3877 /* Free memory from device N.  */
3878
3879 bool
3880 GOMP_OFFLOAD_free (int device, void *ptr)
3881 {
3882   GCN_DEBUG ("Freeing memory on device %d\n", device);
3883
3884   hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3885   if (status != HSA_STATUS_SUCCESS)
3886     {
3887       hsa_error ("Could not free device memory", status);
3888       return false;
3889     }
3890
3891   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3892   bool profiling_dispatch_p
3893     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3894   if (profiling_dispatch_p)
3895     {
3896       acc_prof_info *prof_info = thr->prof_info;
3897       acc_event_info data_event_info;
3898       acc_api_info *api_info = thr->api_info;
3899
3900       prof_info->event_type = acc_ev_free;
3901
3902       data_event_info.data_event.event_type = prof_info->event_type;
3903       data_event_info.data_event.valid_bytes
3904         = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3905       data_event_info.data_event.parent_construct
3906         = acc_construct_parallel;
3907       data_event_info.data_event.implicit = 1;
3908       data_event_info.data_event.tool_info = NULL;
3909       data_event_info.data_event.var_name = NULL;
3910       data_event_info.data_event.bytes = 0;
3911       data_event_info.data_event.host_ptr = NULL;
3912       data_event_info.data_event.device_ptr = (void *) ptr;
3913
3914       api_info->device_api = acc_device_api_other;
3915
3916       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3917                                             api_info);
3918     }
3919
3920   return true;
3921 }
3922
3923 /* Copy data from DEVICE to host.  */
3924
3925 bool
3926 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3927 {
3928   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3929              src, dst);
3930   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3931   if (status != HSA_STATUS_SUCCESS)
3932     GOMP_PLUGIN_error ("memory copy failed");
3933   return true;
3934 }
3935
3936 /* Copy data from host to DEVICE.  */
3937
3938 bool
3939 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3940 {
3941   GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3942              device, dst);
3943   hsa_memory_copy_wrapper (dst, src, n);
3944   return true;
3945 }
3946
3947 /* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
3948
3949 bool
3950 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3951 {
3952   struct gcn_thread *thread_data = gcn_thread ();
3953
3954   if (thread_data && !async_synchronous_p (thread_data->async))
3955     {
3956       struct agent_info *agent = get_agent_info (device);
3957       maybe_init_omp_async (agent);
3958       queue_push_copy (agent->omp_async_queue, dst, src, n);
3959       return true;
3960     }
3961
3962   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3963              device, src, device, dst);
3964   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3965   if (status != HSA_STATUS_SUCCESS)
3966     GOMP_PLUGIN_error ("memory copy failed");
3967   return true;
3968 }
3969
3970 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
3971    measured in bytes.  So we have:
3972
3973    dim1_size: number of bytes to copy on innermost dimension ("row")
3974    dim0_len: number of rows to copy
3975    dst: base pointer for destination of copy
3976    dst_offset1_size: innermost row offset (for dest), in bytes
3977    dst_offset0_len: offset, number of rows (for dest)
3978    dst_dim1_size: whole-array dest row length, in bytes (pitch)
3979    src: base pointer for source of copy
3980    src_offset1_size: innermost row offset (for source), in bytes
3981    src_offset0_len: offset, number of rows (for source)
3982    src_dim1_size: whole-array source row length, in bytes (pitch)
3983 */
3984
3985 int
3986 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
3987                        size_t dim0_len, void *dst, size_t dst_offset1_size,
3988                        size_t dst_offset0_len, size_t dst_dim1_size,
3989                        const void *src, size_t src_offset1_size,
3990                        size_t src_offset0_len, size_t src_dim1_size)
3991 {
3992   if (!hsa_fns.hsa_amd_memory_lock_fn
3993       || !hsa_fns.hsa_amd_memory_unlock_fn
3994       || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
3995     return -1;
3996
3997   /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
3998      out quietly if we have anything oddly-aligned rather than letting the
3999      driver raise an error.  */
4000   if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
4001     return -1;
4002
4003   if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
4004     return -1;
4005
4006   /* Only handle host to device or device to host transfers here.  */
4007   if ((dst_ord == -1 && src_ord == -1)
4008       || (dst_ord != -1 && src_ord != -1))
4009     return -1;
4010
4011   hsa_amd_copy_direction_t dir
4012     = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
4013   hsa_agent_t copy_agent;
4014
4015   /* We need to pin (lock) host memory before we start the transfer.  Try to
4016      lock the minimum size necessary, i.e. using partial first/last rows of the
4017      whole array.  Something like this:
4018
4019          rows -->
4020          ..............
4021      c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
4022      o | ++#######+++++ <- whole row
4023      l | ++#######+++++ <-     "
4024      s v ++#######..... <- last row apart from trailing remainder
4025          ..............
4026
4027      We could split very large transfers into several rectangular copies, but
4028      that is unimplemented for now.  */
4029
4030   size_t bounded_size_host, first_elem_offset_host;
4031   void *host_ptr;
4032   if (dir == hsaHostToDevice)
4033     {
4034       bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
4035       first_elem_offset_host = src_offset0_len * src_dim1_size
4036                                + src_offset1_size;
4037       host_ptr = (void *) src;
4038       struct agent_info *agent = get_agent_info (dst_ord);
4039       copy_agent = agent->id;
4040     }
4041   else
4042     {
4043       bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
4044       first_elem_offset_host = dst_offset0_len * dst_dim1_size
4045                                + dst_offset1_size;
4046       host_ptr = dst;
4047       struct agent_info *agent = get_agent_info (src_ord);
4048       copy_agent = agent->id;
4049     }
4050
4051   void *agent_ptr;
4052
4053   hsa_status_t status
4054     = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4055                                       bounded_size_host, NULL, 0, &agent_ptr);
4056   /* We can't lock the host memory: don't give up though, we might still be
4057      able to use the slow path in our caller.  So, don't make this an
4058      error.  */
4059   if (status != HSA_STATUS_SUCCESS)
4060     return -1;
4061
4062   hsa_pitched_ptr_t dstpp, srcpp;
4063   hsa_dim3_t dst_offsets, src_offsets, ranges;
4064
4065   int retval = 1;
4066
4067   hsa_signal_t completion_signal;
4068   status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4069   if (status != HSA_STATUS_SUCCESS)
4070     {
4071       retval = -1;
4072       goto unlock;
4073     }
4074
4075   if (dir == hsaHostToDevice)
4076     {
4077       srcpp.base = agent_ptr - first_elem_offset_host;
4078       dstpp.base = dst;
4079     }
4080   else
4081     {
4082       srcpp.base = (void *) src;
4083       dstpp.base = agent_ptr - first_elem_offset_host;
4084     }
4085
4086   srcpp.pitch = src_dim1_size;
4087   srcpp.slice = 0;
4088
4089   src_offsets.x = src_offset1_size;
4090   src_offsets.y = src_offset0_len;
4091   src_offsets.z = 0;
4092
4093   dstpp.pitch = dst_dim1_size;
4094   dstpp.slice = 0;
4095
4096   dst_offsets.x = dst_offset1_size;
4097   dst_offsets.y = dst_offset0_len;
4098   dst_offsets.z = 0;
4099
4100   ranges.x = dim1_size;
4101   ranges.y = dim0_len;
4102   ranges.z = 1;
4103
4104   status
4105     = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4106                                                  &src_offsets, &ranges,
4107                                                  copy_agent, dir, 0, NULL,
4108                                                  completion_signal);
4109   /* If the rectangular copy fails, we might still be able to use the slow
4110      path.  We need to unlock the host memory though, so don't return
4111      immediately.  */
4112   if (status != HSA_STATUS_SUCCESS)
4113     retval = -1;
4114   else
4115     hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4116                                         HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
4117                                         HSA_WAIT_STATE_ACTIVE);
4118
4119   hsa_fns.hsa_signal_destroy_fn (completion_signal);
4120
4121 unlock:
4122   status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4123   if (status != HSA_STATUS_SUCCESS)
4124     hsa_fatal ("Could not unlock host memory", status);
4125
4126   return retval;
4127 }
4128
4129 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4130    measured in bytes.  So we have:
4131
4132    dim2_size: number of bytes to copy on innermost dimension ("row")
4133    dim1_len: number of rows per slice to copy
4134    dim0_len: number of slices to copy
4135    dst: base pointer for destination of copy
4136    dst_offset2_size: innermost row offset (for dest), in bytes
4137    dst_offset1_len: offset, number of rows (for dest)
4138    dst_offset0_len: offset, number of slices (for dest)
4139    dst_dim2_size: whole-array dest row length, in bytes (pitch)
4140    dst_dim1_len: whole-array number of rows in slice (for dest)
4141    src: base pointer for source of copy
4142    src_offset2_size: innermost row offset (for source), in bytes
4143    src_offset1_len: offset, number of rows (for source)
4144    src_offset0_len: offset, number of slices (for source)
4145    src_dim2_size: whole-array source row length, in bytes (pitch)
4146    src_dim1_len: whole-array number of rows in slice (for source)
4147 */
4148
4149 int
4150 GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
4151                        size_t dim1_len, size_t dim0_len, void *dst,
4152                        size_t dst_offset2_size, size_t dst_offset1_len,
4153                        size_t dst_offset0_len, size_t dst_dim2_size,
4154                        size_t dst_dim1_len, const void *src,
4155                        size_t src_offset2_size, size_t src_offset1_len,
4156                        size_t src_offset0_len, size_t src_dim2_size,
4157                        size_t src_dim1_len)
4158 {
4159   if (!hsa_fns.hsa_amd_memory_lock_fn
4160       || !hsa_fns.hsa_amd_memory_unlock_fn
4161       || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
4162     return -1;
4163
4164   /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
4165      out quietly if we have anything oddly-aligned rather than letting the
4166      driver raise an error.  */
4167   if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
4168     return -1;
4169
4170   if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
4171     return -1;
4172
4173   /* Only handle host to device or device to host transfers here.  */
4174   if ((dst_ord == -1 && src_ord == -1)
4175       || (dst_ord != -1 && src_ord != -1))
4176     return -1;
4177
4178   hsa_amd_copy_direction_t dir
4179     = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
4180   hsa_agent_t copy_agent;
4181
4182   /* We need to pin (lock) host memory before we start the transfer.  Try to
4183      lock the minimum size necessary, i.e. using partial first/last slices of
4184      the whole 3D array.  Something like this:
4185
4186          slice 0:          slice 1:          slice 2:
4187              __________        __________        __________
4188        ^    /+++++++++/    :  /+++++++++/    :  /         /
4189     column /+++##++++/|    | /+++##++++/|    | /+++##    /  # = subarray
4190      /    /   ##++++/ |    |/+++##++++/ |    |/+++##++++/   + = area to pin
4191          /_________/  :    /_________/  :    /_________/
4192         row --->
4193
4194      We could split very large transfers into several rectangular copies, but
4195      that is unimplemented for now.  */
4196
4197   size_t bounded_size_host, first_elem_offset_host;
4198   void *host_ptr;
4199   if (dir == hsaHostToDevice)
4200     {
4201       size_t slice_bytes = src_dim2_size * src_dim1_len;
4202       bounded_size_host = slice_bytes * (dim0_len - 1)
4203                           + src_dim2_size * (dim1_len - 1)
4204                           + dim2_size;
4205       first_elem_offset_host = src_offset0_len * slice_bytes
4206                                + src_offset1_len * src_dim2_size
4207                                + src_offset2_size;
4208       host_ptr = (void *) src;
4209       struct agent_info *agent = get_agent_info (dst_ord);
4210       copy_agent = agent->id;
4211     }
4212   else
4213     {
4214       size_t slice_bytes = dst_dim2_size * dst_dim1_len;
4215       bounded_size_host = slice_bytes * (dim0_len - 1)
4216                           + dst_dim2_size * (dim1_len - 1)
4217                           + dim2_size;
4218       first_elem_offset_host = dst_offset0_len * slice_bytes
4219                                + dst_offset1_len * dst_dim2_size
4220                                + dst_offset2_size;
4221       host_ptr = dst;
4222       struct agent_info *agent = get_agent_info (src_ord);
4223       copy_agent = agent->id;
4224     }
4225
4226   void *agent_ptr;
4227
4228   hsa_status_t status
4229     = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4230                                       bounded_size_host, NULL, 0, &agent_ptr);
4231   /* We can't lock the host memory: don't give up though, we might still be
4232      able to use the slow path in our caller (maybe even with iterated memcpy2d
4233      calls).  So, don't make this an error.  */
4234   if (status != HSA_STATUS_SUCCESS)
4235     return -1;
4236
4237   hsa_pitched_ptr_t dstpp, srcpp;
4238   hsa_dim3_t dst_offsets, src_offsets, ranges;
4239
4240   int retval = 1;
4241
4242   hsa_signal_t completion_signal;
4243   status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4244   if (status != HSA_STATUS_SUCCESS)
4245     {
4246       retval = -1;
4247       goto unlock;
4248     }
4249
4250   if (dir == hsaHostToDevice)
4251     {
4252       srcpp.base = agent_ptr - first_elem_offset_host;
4253       dstpp.base = dst;
4254     }
4255   else
4256     {
4257       srcpp.base = (void *) src;
4258       dstpp.base = agent_ptr - first_elem_offset_host;
4259     }
4260
4261   /* Pitch is measured in bytes.  */
4262   srcpp.pitch = src_dim2_size;
4263   /* Slice is also measured in bytes (i.e. total per-slice).  */
4264   srcpp.slice = src_dim2_size * src_dim1_len;
4265
4266   src_offsets.x = src_offset2_size;
4267   src_offsets.y = src_offset1_len;
4268   src_offsets.z = src_offset0_len;
4269
4270   /* As above.  */
4271   dstpp.pitch = dst_dim2_size;
4272   dstpp.slice = dst_dim2_size * dst_dim1_len;
4273
4274   dst_offsets.x = dst_offset2_size;
4275   dst_offsets.y = dst_offset1_len;
4276   dst_offsets.z = dst_offset0_len;
4277
4278   ranges.x = dim2_size;
4279   ranges.y = dim1_len;
4280   ranges.z = dim0_len;
4281
4282   status
4283     = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4284                                                  &src_offsets, &ranges,
4285                                                  copy_agent, dir, 0, NULL,
4286                                                  completion_signal);
4287   /* If the rectangular copy fails, we might still be able to use the slow
4288      path.  We need to unlock the host memory though, so don't return
4289      immediately.  */
4290   if (status != HSA_STATUS_SUCCESS)
4291     retval = -1;
4292   else
4293     {
4294       hsa_signal_value_t sv
4295         = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4296                                               HSA_SIGNAL_CONDITION_LT, 1,
4297                                               UINT64_MAX,
4298                                               HSA_WAIT_STATE_ACTIVE);
4299       if (sv < 0)
4300         {
4301           GCN_WARNING ("async copy rect failure");
4302           retval = -1;
4303         }
4304     }
4305
4306   hsa_fns.hsa_signal_destroy_fn (completion_signal);
4307
4308 unlock:
4309   status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4310   if (status != HSA_STATUS_SUCCESS)
4311     hsa_fatal ("Could not unlock host memory", status);
4312
4313   return retval;
4314 }
4315
4316 /* }}}  */
4317 /* {{{ OpenMP Plugin API  */
4318
4319 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4320    in VARS as a parameter.  The kernel is identified by FN_PTR which must point
4321    to a kernel_info structure, and must have previously been loaded to the
4322    specified device.  */
4323
4324 void
4325 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
4326 {
4327   struct agent_info *agent = get_agent_info (device);
4328   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4329   struct GOMP_kernel_launch_attributes def;
4330   struct GOMP_kernel_launch_attributes *kla;
4331   assert (agent == kernel->agent);
4332
4333   /* If we get here then the kernel must be OpenMP.  */
4334   kernel->kind = KIND_OPENMP;
4335
4336   if (!parse_target_attributes (args, &def, &kla, agent))
4337     {
4338       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4339       return;
4340     }
4341   run_kernel (kernel, vars, kla, NULL, false);
4342 }
4343
4344 /* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
4345    GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4346    GOMP_PLUGIN_target_task_completion when it has finished.  */
4347
4348 void
4349 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
4350                         void **args, void *async_data)
4351 {
4352   GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4353   struct agent_info *agent = get_agent_info (device);
4354   struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
4355   struct GOMP_kernel_launch_attributes def;
4356   struct GOMP_kernel_launch_attributes *kla;
4357   assert (agent == kernel->agent);
4358
4359   /* If we get here then the kernel must be OpenMP.  */
4360   kernel->kind = KIND_OPENMP;
4361
4362   if (!parse_target_attributes (args, &def, &kla, agent))
4363     {
4364       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4365       return;
4366     }
4367
4368   maybe_init_omp_async (agent);
4369   queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
4370   queue_push_callback (agent->omp_async_queue,
4371                        GOMP_PLUGIN_target_task_completion, async_data);
4372 }
4373
4374 /* }}} */
4375 /* {{{ OpenACC Plugin API  */
4376
4377 /* Run a synchronous OpenACC kernel.  The device number is inferred from the
4378    already-loaded KERNEL.  */
4379
4380 void
4381 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
4382                            size_t mapnum __attribute__((unused)),
4383                            void **hostaddrs __attribute__((unused)),
4384                            void **devaddrs, unsigned *dims,
4385                            void *targ_mem_desc)
4386 {
4387   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4388
4389   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
4390 }
4391
4392 /* Run an asynchronous OpenACC kernel on the specified queue.  */
4393
4394 void
4395 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
4396                                  size_t mapnum __attribute__((unused)),
4397                                  void **hostaddrs __attribute__((unused)),
4398                                  void **devaddrs,
4399                                  unsigned *dims, void *targ_mem_desc,
4400                                  struct goacc_asyncqueue *aq)
4401 {
4402   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4403
4404   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
4405 }
4406
4407 /* Create a new asynchronous thread and queue for running future kernels.  */
4408
4409 struct goacc_asyncqueue *
4410 GOMP_OFFLOAD_openacc_async_construct (int device)
4411 {
4412   struct agent_info *agent = get_agent_info (device);
4413
4414   pthread_mutex_lock (&agent->async_queues_mutex);
4415
4416   struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
4417   aq->agent = get_agent_info (device);
4418   aq->prev = NULL;
4419   aq->next = agent->async_queues;
4420   if (aq->next)
4421     {
4422       aq->next->prev = aq;
4423       aq->id = aq->next->id + 1;
4424     }
4425   else
4426     aq->id = 1;
4427   agent->async_queues = aq;
4428
4429   aq->queue_first = 0;
4430   aq->queue_n = 0;
4431   aq->drain_queue_stop = 0;
4432
4433   if (pthread_mutex_init (&aq->mutex, NULL))
4434     {
4435       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
4436       return false;
4437     }
4438   if (pthread_cond_init (&aq->queue_cond_in, NULL))
4439     {
4440       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4441       return false;
4442     }
4443   if (pthread_cond_init (&aq->queue_cond_out, NULL))
4444     {
4445       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4446       return false;
4447     }
4448
4449   hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4450                                                      ASYNC_QUEUE_SIZE,
4451                                                      HSA_QUEUE_TYPE_MULTI,
4452                                                      hsa_queue_callback, NULL,
4453                                                      UINT32_MAX, UINT32_MAX,
4454                                                      &aq->hsa_queue);
4455   if (status != HSA_STATUS_SUCCESS)
4456     hsa_fatal ("Error creating command queue", status);
4457
4458   int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4459   if (err != 0)
4460     GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4461                        strerror (err));
4462   GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4463              aq->id);
4464
4465   pthread_mutex_unlock (&agent->async_queues_mutex);
4466
4467   return aq;
4468 }
4469
4470 /* Destroy an existing asynchronous thread and queue.  Waits for any
4471    currently-running task to complete, but cancels any queued tasks.  */
4472
4473 bool
4474 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4475 {
4476   struct agent_info *agent = aq->agent;
4477
4478   finalize_async_thread (aq);
4479
4480   pthread_mutex_lock (&agent->async_queues_mutex);
4481
4482   int err;
4483   if ((err = pthread_mutex_destroy (&aq->mutex)))
4484     {
4485       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4486       goto fail;
4487     }
4488   if (pthread_cond_destroy (&aq->queue_cond_in))
4489     {
4490       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4491       goto fail;
4492     }
4493   if (pthread_cond_destroy (&aq->queue_cond_out))
4494     {
4495       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4496       goto fail;
4497     }
4498   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4499   if (status != HSA_STATUS_SUCCESS)
4500     {
4501       hsa_error ("Error destroying command queue", status);
4502       goto fail;
4503     }
4504
4505   if (aq->prev)
4506     aq->prev->next = aq->next;
4507   if (aq->next)
4508     aq->next->prev = aq->prev;
4509   if (agent->async_queues == aq)
4510     agent->async_queues = aq->next;
4511
4512   GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4513
4514   free (aq);
4515   pthread_mutex_unlock (&agent->async_queues_mutex);
4516   return true;
4517
4518 fail:
4519   pthread_mutex_unlock (&agent->async_queues_mutex);
4520   return false;
4521 }
4522
4523 /* Return true if the specified async queue is currently empty.  */
4524
4525 int
4526 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4527 {
4528   return queue_empty (aq);
4529 }
4530
4531 /* Block until the specified queue has executed all its tasks and the
4532    queue is empty.  */
4533
4534 bool
4535 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4536 {
4537   wait_queue (aq);
4538   return true;
4539 }
4540
4541 /* Add a serialization point across two async queues. Any new tasks added to
4542    AQ2, after this call, will not run until all tasks on AQ1, at the time
4543    of this call, have completed.  */
4544
4545 bool
4546 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4547                                       struct goacc_asyncqueue *aq2)
4548 {
4549   /* For serialize, stream aq2 waits for aq1 to complete work that has been
4550      scheduled to run on it up to this point.  */
4551   if (aq1 != aq2)
4552     {
4553       struct placeholder *placeholderp = queue_push_placeholder (aq1);
4554       queue_push_asyncwait (aq2, placeholderp);
4555     }
4556   return true;
4557 }
4558
4559 /* Add an opaque callback to the given async queue.  */
4560
4561 void
4562 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4563                                            void (*fn) (void *), void *data)
4564 {
4565   queue_push_callback (aq, fn, data);
4566 }
4567
4568 /* Queue up an asynchronous data copy from host to DEVICE.  */
4569
4570 bool
4571 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4572                                      size_t n, struct goacc_asyncqueue *aq)
4573 {
4574   struct agent_info *agent = get_agent_info (device);
4575   assert (agent == aq->agent);
4576   queue_push_copy (aq, dst, src, n);
4577   return true;
4578 }
4579
4580 /* Queue up an asynchronous data copy from DEVICE to host.  */
4581
4582 bool
4583 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4584                                      size_t n, struct goacc_asyncqueue *aq)
4585 {
4586   struct agent_info *agent = get_agent_info (device);
4587   assert (agent == aq->agent);
4588   queue_push_copy (aq, dst, src, n);
4589   return true;
4590 }
4591
4592 union goacc_property_value
4593 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4594 {
4595   struct agent_info *agent = get_agent_info (device);
4596
4597   union goacc_property_value propval = { .val = 0 };
4598
4599   switch (prop)
4600     {
4601     case GOACC_PROPERTY_FREE_MEMORY:
4602       /* Not supported. */
4603       break;
4604     case GOACC_PROPERTY_MEMORY:
4605       {
4606         size_t size;
4607         hsa_region_t region = agent->data_region;
4608         hsa_status_t status =
4609           hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4610         if (status == HSA_STATUS_SUCCESS)
4611           propval.val = size;
4612         break;
4613       }
4614     case GOACC_PROPERTY_NAME:
4615       propval.ptr = agent->name;
4616       break;
4617     case GOACC_PROPERTY_VENDOR:
4618       propval.ptr = agent->vendor_name;
4619       break;
4620     case GOACC_PROPERTY_DRIVER:
4621       propval.ptr = hsa_context.driver_version_s;
4622       break;
4623     }
4624
4625   return propval;
4626 }
4627
4628 /* Set up plugin-specific thread-local-data (host-side).  */
4629
4630 void *
4631 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4632 {
4633   struct gcn_thread *thread_data
4634     = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4635
4636   thread_data->async = GOMP_ASYNC_SYNC;
4637
4638   return (void *) thread_data;
4639 }
4640
4641 /* Clean up plugin-specific thread-local-data.  */
4642
4643 void
4644 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4645 {
4646   free (data);
4647 }
4648
4649 /* }}} */