libhsail-rt/rt/workitems.c

   1 /* workitems.c -- The main runtime entry that performs work-item execution in
   2    various ways and the builtin functions closely related to the
   3    implementation.
   4
   5    Copyright (C) 2015-2017 Free Software Foundation, Inc.
   6    Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
   7    for General Processor Tech.
   8
   9    Permission is hereby granted, free of charge, to any person obtaining a
  10    copy of this software and associated documentation files
  11    (the "Software"), to deal in the Software without restriction, including
  12    without limitation the rights to use, copy, modify, merge, publish,
  13    distribute, sublicense, and/or sell copies of the Software, and to
  14    permit persons to whom the Software is furnished to do so, subject to
  15    the following conditions:
  16
  17    The above copyright notice and this permission notice shall be included
  18    in all copies or substantial portions of the Software.
  19
  20    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  21    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  23    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
  24    DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  25    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  26    USE OR OTHER DEALINGS IN THE SOFTWARE.
  27 */
  28
  29 /* The fiber based multiple work-item work-group execution uses ucontext
  30    based user mode threading.  However, if gccbrig is able to optimize the
  31    kernel to a much faster work-group function that implements the multiple
  32    WI execution using loops instead of fibers requiring slow context switches,
  33    the fiber-based implementation won't be called.
  34  */
  35
  36 #include <stdlib.h>
  37 #include <signal.h>
  38 #include <string.h>
  39
  40 #include "workitems.h"
  41 #include "phsa-rt.h"
  42
  43 #ifdef HAVE_FIBERS
  44 #include "fibers.h"
  45 #endif
  46
  47 #ifdef BENCHMARK_PHSA_RT
  48 #include <stdio.h>
  49 #include <time.h>
  50
  51 static uint64_t wi_count = 0;
  52 static uint64_t wis_skipped = 0;
  53 static uint64_t wi_total = 0;
  54 static clock_t start_time;
  55
  56 #endif
  57
  58 #ifdef DEBUG_PHSA_RT
  59 #include <stdio.h>
  60 #endif
  61
  62 #define PRIVATE_SEGMENT_ALIGN 256
  63 #define FIBER_STACK_SIZE (64*1024)
  64 #define GROUP_SEGMENT_ALIGN 256
  65
  66 uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
  67
  68 uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
  69
  70 uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
  71
  72 uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
  73
  74 uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
  75
  76 void
  77 phsa_fatal_error (int code)
  78 {
  79   exit (code);
  80 }
  81
  82 #ifdef HAVE_FIBERS
  83 /* ucontext-based work-item thread implementation.  Runs all work-items in
  84    separate fibers.  */
  85
  86 static void
  87 phsa_work_item_thread (int arg0, int arg1)
  88 {
  89   void *arg = fiber_int_args_to_ptr (arg0, arg1);
  90
  91   PHSAWorkItem *wi = (PHSAWorkItem *) arg;
  92   volatile PHSAWorkGroup *wg = wi->wg;
  93   PHSAKernelLaunchData *l_data = wi->launch_data;
  94
  95   do
  96     {
  97       int retcode
  98         = fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);
  99
 100       /* At this point the threads can assume that either more_wgs is 0 or
 101          the current_work_group_* is set to point to the WG executed next.  */
 102       if (!wi->wg->more_wgs)
 103         break;
 104 #ifdef DEBUG_PHSA_RT
 105       printf (
 106         "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
 107         wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x,
 108         l_data->wg_max_y, l_data->wg_max_z);
 109 #endif
 110
 111       if (wi->x < __hsail_currentworkgroupsize (0, wi)
 112           && wi->y < __hsail_currentworkgroupsize (1, wi)
 113           && wi->z < __hsail_currentworkgroupsize (2, wi))
 114         {
 115           l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
 116                           wg->private_base_ptr);
 117 #ifdef DEBUG_PHSA_RT
 118           printf ("done.\n");
 119 #endif
 120 #ifdef BENCHMARK_PHSA_RT
 121           wi_count++;
 122 #endif
 123         }
 124       else
 125         {
 126 #ifdef DEBUG_PHSA_RT
 127           printf ("skipped (partial WG).\n");
 128 #endif
 129 #ifdef BENCHMARK_PHSA_RT
 130           wis_skipped++;
 131 #endif
 132         }
 133
 134       retcode
 135         = fiber_barrier_reach ((fiber_barrier_t *)
 136                                l_data->wg_completion_barrier);
 137
 138       /* The first thread updates the WG to execute next etc.  */
 139
 140       if (retcode == 0)
 141         {
 142 #ifdef EXECUTE_WGS_BACKWARDS
 143           if (wg->x == l_data->wg_min_x)
 144             {
 145               wg->x = l_data->wg_max_x - 1;
 146               if (wg->y == l_data->wg_min_y)
 147                 {
 148                   wg->y = l_data->wg_max_y - 1;
 149                   if (wg->z == l_data->wg_min_z)
 150                     wg->more_wgs = 0;
 151                   else
 152                     wg->z--;
 153                 }
 154               else
 155                 wg->y--;
 156             }
 157           else
 158             wg->x--;
 159 #else
 160           if (wg->x + 1 >= l_data->wg_max_x)
 161             {
 162               wg->x = l_data->wg_min_x;
 163               if (wg->y + 1 >= l_data->wg_max_y)
 164                 {
 165                   wg->y = l_data->wg_min_y;
 166                   if (wg->z + 1 >= l_data->wg_max_z)
 167                     wg->more_wgs = 0;
 168                   else
 169                     wg->z++;
 170                 }
 171               else
 172                 wg->y++;
 173             }
 174           else
 175             wg->x++;
 176 #endif
 177
 178           /* Reinitialize the work-group barrier according to the new WG's
 179              size, which might not be the same as the previous ones, due
 180              to "partial WGs".  */
 181           size_t wg_size = __hsail_currentworkgroupsize (0, wi)
 182                            * __hsail_currentworkgroupsize (1, wi)
 183                            * __hsail_currentworkgroupsize (2, wi);
 184
 185 #ifdef DEBUG_PHSA_RT
 186           printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
 187 #endif
 188           fiber_barrier_init ((fiber_barrier_t *)
 189                               wi->launch_data->wg_sync_barrier,
 190                               wg_size);
 191
 192 #ifdef BENCHMARK_PHSA_RT
 193           if (wi_count % 1000 == 0)
 194             {
 195               clock_t spent_time = clock () - start_time;
 196               double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
 197               double wis_per_sec = wi_count / spent_time_sec;
 198               uint64_t eta_sec
 199                 = (wi_total - wi_count - wis_skipped) / wis_per_sec;
 200
 201               printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
 202                       "%lu s)\n",
 203                       wi_count, wis_skipped, (uint64_t) spent_time_sec,
 204                       (uint64_t) wis_per_sec, (uint64_t) eta_sec);
 205             }
 206 #endif
 207         }
 208     }
 209   while (1);
 210
 211   fiber_exit ();
 212 }
 213 #endif
 214
 215 #define MIN(a, b) ((a < b) ? a : b)
 216 #define MAX(a, b) ((a > b) ? a : b)
 217
 218 #ifdef HAVE_FIBERS
 219 /* Spawns a given number of work-items to execute a set of work-groups,
 220    blocks until their completion.  */
 221
 222 static void
 223 phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
 224                       size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
 225 {
 226   PHSAWorkItem *wi_threads = NULL;
 227   PHSAWorkGroup wg;
 228   size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
 229   fiber_barrier_t wg_start_barrier;
 230   fiber_barrier_t wg_completion_barrier;
 231   fiber_barrier_t wg_sync_barrier;
 232
 233   max_x = wg_size_x == 0 ? 1 : wg_size_x;
 234   max_y = wg_size_y == 0 ? 1 : wg_size_y;
 235   max_z = wg_size_z == 0 ? 1 : wg_size_z;
 236
 237   size_t wg_size = max_x * max_y * max_z;
 238   if (wg_size > PHSA_MAX_WG_SIZE)
 239     phsa_fatal_error (2);
 240
 241   wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
 242   if (wg.private_segment_total_size > 0
 243       && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
 244                          wg.private_segment_total_size)
 245            != 0)
 246     phsa_fatal_error (3);
 247
 248   wg.alloca_stack_p = wg.private_segment_total_size;
 249   wg.alloca_frame_p = wg.alloca_stack_p;
 250
 251 #ifdef EXECUTE_WGS_BACKWARDS
 252   wg.x = context->wg_max_x - 1;
 253   wg.y = context->wg_max_y - 1;
 254   wg.z = context->wg_max_z - 1;
 255 #else
 256   wg.x = context->wg_min_x;
 257   wg.y = context->wg_min_y;
 258   wg.z = context->wg_min_z;
 259 #endif
 260
 261   fiber_barrier_init (&wg_sync_barrier, wg_size);
 262   fiber_barrier_init (&wg_start_barrier, wg_size);
 263   fiber_barrier_init (&wg_completion_barrier, wg_size);
 264
 265   context->wg_start_barrier = &wg_start_barrier;
 266   context->wg_sync_barrier = &wg_sync_barrier;
 267   context->wg_completion_barrier = &wg_completion_barrier;
 268
 269   wg.more_wgs = 1;
 270   wg.group_base_ptr = group_base_ptr;
 271
 272 #ifdef BENCHMARK_PHSA_RT
 273   wi_count = 0;
 274   wis_skipped = 0;
 275   start_time = clock ();
 276 #endif
 277   wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
 278   for (x = 0; x < max_x; ++x)
 279     for (y = 0; y < max_y; ++y)
 280       for (z = 0; z < max_z; ++z)
 281         {
 282           PHSAWorkItem *wi = &wi_threads[flat_wi_id];
 283           wi->launch_data = context;
 284           wi->wg = &wg;
 285           wi->x = x;
 286           wi->y = y;
 287           wi->z = z;
 288
 289           /* TODO: set the stack size according to the private
 290                    segment size.  Too big stack consumes huge amount of
 291                    memory in case of huge number of WIs and a too small stack
 292                    will fail in mysterious and potentially dangerous ways.  */
 293
 294           fiber_init (&wi->fiber, phsa_work_item_thread, wi,
 295                       FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
 296           ++flat_wi_id;
 297         }
 298
 299   do
 300     {
 301       --flat_wi_id;
 302       fiber_join (&wi_threads[flat_wi_id].fiber);
 303     }
 304   while (flat_wi_id > 0);
 305
 306   if (wg.private_segment_total_size > 0)
 307     free (wg.private_base_ptr);
 308
 309   free (wi_threads);
 310 }
 311
 312 /* Spawn the work-item threads to execute work-groups and let
 313    them execute all the WGs, including a potential partial WG.  */
 314
 315 static void
 316 phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr)
 317 {
 318   hsa_kernel_dispatch_packet_t *dp = context->dp;
 319   size_t x, y, z;
 320
 321   /* TO DO: host-side memory management of group and private segment
 322      memory.  Agents in general are less likely to support efficient dynamic mem
 323      allocation.  */
 324   if (dp->group_segment_size > 0
 325       && posix_memalign (&group_base_ptr, PRIVATE_SEGMENT_ALIGN,
 326                          dp->group_segment_size) != 0)
 327     phsa_fatal_error (3);
 328
 329   context->group_segment_start_addr = (size_t) group_base_ptr;
 330
 331   /* HSA seems to allow the WG size to be larger than the grid size.  We need to
 332      saturate the effective WG size to the grid size to prevent the extra WIs
 333      from executing.  */
 334   size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
 335   sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
 336   sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
 337   sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
 338   sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
 339
 340 #ifdef BENCHMARK_PHSA_RT
 341   wi_total = (uint64_t) dp->grid_size_x
 342              * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
 343              * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
 344 #endif
 345
 346   /* For now execute all work groups in a single coarse thread (does not utilize
 347      multicore/multithread).  */
 348   context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
 349
 350   int dims = dp->setup & 0x3;
 351
 352   context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
 353                       / dp->workgroup_size_x;
 354
 355   context->wg_max_y
 356     = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
 357                        / dp->workgroup_size_y;
 358
 359   context->wg_max_z
 360     = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
 361                        / dp->workgroup_size_z;
 362
 363 #ifdef DEBUG_PHSA_RT
 364   printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
 365           "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
 366           context->wg_min_x, context->wg_min_y, context->wg_min_z,
 367           context->wg_max_x, context->wg_max_y, context->wg_max_z,
 368           sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
 369           dp->grid_size_y, dp->grid_size_z);
 370 #endif
 371
 372   phsa_execute_wi_gang (context, group_base_ptr, sat_wg_size_x, sat_wg_size_y,
 373                         sat_wg_size_z);
 374
 375   if (dp->group_segment_size > 0)
 376     free (group_base_ptr);
 377 }
 378 #endif
 379
 380 /* Executes the given work-group function for all work groups in the grid.
 381
 382    A work-group function is a version of the original kernel which executes
 383    the kernel for all work-items in a work-group.  It is produced by gccbrig
 384    if it can handle the kernel's barrier usage and is much faster way to
 385    execute massive numbers of work-items in a non-SPMD machine than fibers
 386    (easily 100x faster).  */
 387 static void
 388 phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr)
 389 {
 390   hsa_kernel_dispatch_packet_t *dp = context->dp;
 391   size_t x, y, z, wg_x, wg_y, wg_z;
 392
 393   /* TODO: host-side memory management of group and private segment
 394      memory.  Agents in general are less likely to support efficient dynamic mem
 395      allocation.  */
 396   if (dp->group_segment_size > 0
 397       && posix_memalign (&group_base_ptr, GROUP_SEGMENT_ALIGN,
 398                          dp->group_segment_size) != 0)
 399     phsa_fatal_error (3);
 400
 401   context->group_segment_start_addr = (size_t) group_base_ptr;
 402
 403   /* HSA seems to allow the WG size to be larger than the grid size.  We need
 404      to saturate the effective WG size to the grid size to prevent the extra WIs
 405      from executing.  */
 406   size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
 407   sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
 408   sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
 409   sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
 410   sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
 411
 412 #ifdef BENCHMARK_PHSA_RT
 413   wi_total = (uint64_t) dp->grid_size_x
 414              * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
 415              * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
 416 #endif
 417
 418   context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
 419
 420   int dims = dp->setup & 0x3;
 421
 422   context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
 423                       / dp->workgroup_size_x;
 424
 425   context->wg_max_y
 426     = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
 427                        / dp->workgroup_size_y;
 428
 429   context->wg_max_z
 430     = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
 431                        / dp->workgroup_size_z;
 432
 433 #ifdef DEBUG_PHSA_RT
 434   printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
 435           "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
 436           context->wg_min_x, context->wg_min_y, context->wg_min_z,
 437           context->wg_max_x, context->wg_max_y, context->wg_max_z,
 438           sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
 439           dp->grid_size_y, dp->grid_size_z);
 440 #endif
 441
 442   PHSAWorkItem wi;
 443   PHSAWorkGroup wg;
 444   wi.wg = &wg;
 445   wi.x = wi.y = wi.z = 0;
 446   wi.launch_data = context;
 447
 448 #ifdef BENCHMARK_PHSA_RT
 449   start_time = clock ();
 450   uint64_t wg_count = 0;
 451 #endif
 452
 453   size_t wg_size = __hsail_workgroupsize (0, &wi)
 454                    * __hsail_workgroupsize (1, &wi)
 455                    * __hsail_workgroupsize (2, &wi);
 456
 457   void *private_base_ptr = NULL;
 458   if (dp->private_segment_size > 0
 459       && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
 460                          dp->private_segment_size * wg_size)
 461            != 0)
 462     phsa_fatal_error (3);
 463
 464   wg.alloca_stack_p = dp->private_segment_size * wg_size;
 465   wg.alloca_frame_p = wg.alloca_stack_p;
 466
 467   wg.private_base_ptr = private_base_ptr;
 468   wg.group_base_ptr = group_base_ptr;
 469
 470 #ifdef DEBUG_PHSA_RT
 471   printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
 472           wg_size, private_base_ptr);
 473 #endif
 474
 475   for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
 476     for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
 477       for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
 478         {
 479           wi.wg->x = wg_x;
 480           wi.wg->y = wg_y;
 481           wi.wg->z = wg_z;
 482
 483           context->kernel (context->kernarg_addr, &wi, group_base_ptr,
 484                            private_base_ptr);
 485
 486 #if defined (BENCHMARK_PHSA_RT)
 487           wg_count++;
 488           if (wg_count % 1000000 == 0)
 489             {
 490               clock_t spent_time = clock () - start_time;
 491               uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
 492                                   + wg_z * sat_wg_size_z;
 493               double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
 494               double wis_per_sec = wi_count / spent_time_sec;
 495               uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
 496
 497               printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
 498                       wi_count, (uint64_t) spent_time_sec,
 499                       (uint64_t) wis_per_sec, (uint64_t) eta_sec);
 500             }
 501 #endif
 502         }
 503
 504 #ifdef BENCHMARK_PHSA_RT
 505   clock_t spent_time = clock () - start_time;
 506   double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
 507   double wis_per_sec = wi_total / spent_time_sec;
 508
 509   printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
 510           (uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
 511 #endif
 512
 513   if (dp->group_segment_size > 0)
 514     free (group_base_ptr);
 515
 516   free (private_base_ptr);
 517   private_base_ptr = NULL;
 518 }
 519
/* gccbrig generates the following from each HSAIL kernel:

   1) The actual kernel function (a single work-item kernel or a work-group
      function) generated from HSAIL (BRIG).

         static void _Kernel (void* args, void* context, void* group_base_ptr)
         {
           ...
         }

   2) A public facing kernel function that is called from the PHSA runtime:

   a) A single work-item function (that requires fibers for multi-WI):

      void Kernel (void* context)
      {
         __hsail_launch_kernel (_Kernel, context);
      }

      or

   b) a work-group function, when gccbrig could generate one:

      void Kernel (void* context)
      {
         __hsail_launch_wg_function (_Kernel, context);
      }
*/
 548
 549 #ifdef HAVE_FIBERS
 550
 551 void
 552 __hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
 553                        void *group_base_ptr)
 554 {
 555   context->kernel = kernel;
 556   phsa_spawn_work_items (context, group_base_ptr);
 557 }
 558 #endif
 559
 560 void
 561 __hsail_launch_wg_function (gccbrigKernelFunc kernel,
 562                             PHSAKernelLaunchData *context, void *group_base_ptr)
 563 {
 564   context->kernel = kernel;
 565   phsa_execute_work_groups (context, group_base_ptr);
 566 }
 567
 568 uint32_t
 569 __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
 570 {
 571   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 572
 573   uint32_t id;
 574   switch (dim)
 575     {
 576     default:
 577     case 0:
 578       /* Overflow semantics in the case of WG dim > grid dim.  */
 579       id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
 580            % dp->grid_size_x;
 581       break;
 582     case 1:
 583       id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
 584            % dp->grid_size_y;
 585       break;
 586     case 2:
 587       id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
 588            % dp->grid_size_z;
 589       break;
 590     }
 591   return id;
 592 }
 593
 594 uint64_t
 595 __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
 596 {
 597   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 598
 599   uint64_t id;
 600   switch (dim)
 601     {
 602     default:
 603     case 0:
 604       /* Overflow semantics in the case of WG dim > grid dim.  */
 605       id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
 606            % dp->grid_size_x;
 607       break;
 608     case 1:
 609       id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
 610            % dp->grid_size_y;
 611       break;
 612     case 2:
 613       id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
 614            % dp->grid_size_z;
 615       break;
 616     }
 617   return id;
 618 }
 619
 620
 621 uint32_t
 622 __hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
 623 {
 624   PHSAWorkItem *c = (PHSAWorkItem *) context;
 625   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 626
 627   /* The number of dimensions is in the two least significant bits.  */
 628   int dims = dp->setup & 0x3;
 629
 630   uint32_t id;
 631   switch (dim)
 632     {
 633     default:
 634     case 0:
 635       id = c->x;
 636       break;
 637     case 1:
 638       id = dims < 2 ? 0 : c->y;
 639       break;
 640     case 2:
 641       id = dims < 3 ? 0 : c->z;
 642       break;
 643     }
 644   return id;
 645 }
 646
 647 uint32_t
 648 __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
 649 {
 650   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 651   int dims = dp->setup & 0x3;
 652
 653   uint32_t id;
 654   switch (dim)
 655     {
 656     default:
 657     case 0:
 658       id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
 659       break;
 660     case 1:
 661       id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
 662                             / dp->workgroup_size_y;
 663       break;
 664     case 2:
 665       id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
 666                             / dp->workgroup_size_z;
 667       break;
 668     }
 669   return id;
 670 }
 671
 672 uint32_t
 673 __hsail_workitemflatid (PHSAWorkItem *c)
 674 {
 675   hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
 676
 677   return c->x + c->y * dp->workgroup_size_x
 678          + c->z * dp->workgroup_size_x * dp->workgroup_size_y;
 679 }
 680
 681 uint32_t
 682 __hsail_currentworkitemflatid (PHSAWorkItem *c)
 683 {
 684   hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
 685
 686   return c->x + c->y * __hsail_currentworkgroupsize (0, c)
 687          + c->z * __hsail_currentworkgroupsize (0, c)
 688              * __hsail_currentworkgroupsize (1, c);
 689 }
 690
 691 void
 692 __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
 693 {
 694   switch (dim)
 695     {
 696     default:
 697     case 0:
 698       context->x = id;
 699       break;
 700     case 1:
 701       context->y = id;
 702       break;
 703     case 2:
 704       context->z = id;
 705       break;
 706     }
 707 }
 708
 709 uint64_t
 710 __hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
 711 {
 712   PHSAWorkItem *c = (PHSAWorkItem *) context;
 713   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 714
 715   /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1.  */
 716   uint64_t id0 = __hsail_workitemabsid (0, context);
 717   uint64_t id1 = __hsail_workitemabsid (1, context);
 718   uint64_t id2 = __hsail_workitemabsid (2, context);
 719
 720   uint64_t max0 = dp->grid_size_x;
 721   uint64_t max1 = dp->grid_size_y;
 722   uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
 723
 724   return id;
 725 }
 726
 727 uint32_t
 728 __hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
 729 {
 730   PHSAWorkItem *c = (PHSAWorkItem *) context;
 731   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 732
 733   /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1.  */
 734   uint64_t id0 = __hsail_workitemabsid (0, context);
 735   uint64_t id1 = __hsail_workitemabsid (1, context);
 736   uint64_t id2 = __hsail_workitemabsid (2, context);
 737
 738   uint64_t max0 = dp->grid_size_x;
 739   uint64_t max1 = dp->grid_size_y;
 740   uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
 741   return (uint32_t) id;
 742 }
 743
 744 uint32_t
 745 __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
 746 {
 747   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 748   uint32_t wg_size = 0;
 749   switch (dim)
 750     {
 751     default:
 752     case 0:
 753       if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x)
 754         wg_size = dp->workgroup_size_x; /* Full WG.  */
 755       else
 756         wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG.  */
 757       break;
 758     case 1:
 759       if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y)
 760         wg_size = dp->workgroup_size_y; /* Full WG.  */
 761       else
 762         wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG.  */
 763       break;
 764     case 2:
 765       if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z)
 766         wg_size = dp->workgroup_size_z; /* Full WG.  */
 767       else
 768         wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG.  */
 769       break;
 770     }
 771   return wg_size;
 772 }
 773
 774 uint32_t
 775 __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
 776 {
 777   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 778   switch (dim)
 779     {
 780     default:
 781     case 0:
 782       return dp->workgroup_size_x;
 783     case 1:
 784       return dp->workgroup_size_y;
 785     case 2:
 786       return dp->workgroup_size_z;
 787     }
 788 }
 789
 790 uint32_t
 791 __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
 792 {
 793   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 794   switch (dim)
 795     {
 796     default:
 797     case 0:
 798       return dp->grid_size_x;
 799     case 1:
 800       return dp->grid_size_y;
 801     case 2:
 802       return dp->grid_size_z;
 803     }
 804 }
 805
 806 uint32_t
 807 __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
 808 {
 809   switch (dim)
 810     {
 811     default:
 812     case 0:
 813       return wi->wg->x;
 814     case 1:
 815       return wi->wg->y;
 816     case 2:
 817       return wi->wg->z;
 818     }
 819 }
 820
 821 uint32_t
 822 __hsail_dim (PHSAWorkItem *wi)
 823 {
 824   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 825   return dp->setup & 0x3;
 826 }
 827
 828 uint64_t
 829 __hsail_packetid (PHSAWorkItem *wi)
 830 {
 831   return wi->launch_data->packet_id;
 832 }
 833
 834 uint32_t
 835 __hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
 836 {
 837   return (uint32_t) wi->launch_data->dp->completion_signal.handle;
 838 }
 839
 840 uint64_t
 841 __hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
 842 {
 843   return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
 844 }
 845
 846 #ifdef HAVE_FIBERS
 847 void
 848 __hsail_barrier (PHSAWorkItem *wi)
 849 {
 850   fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
 851 }
 852 #endif
 853
 854 /* Return a 32b private segment address that points to a dynamically
 855    allocated chunk of 'size' with 'align'.
 856
 857    Allocates the space from the end of the private segment allocated
 858    for the whole work group.  In implementations with separate private
 859    memories per WI, we will need to have a stack pointer per WI.  But in
 860    the current implementation, the segment is shared, so we possibly
 861    save some space in case all WIs do not call the alloca.
 862
 863    The "alloca frames" are organized as follows:
 864
 865    wg->alloca_stack_p points to the last allocated data (initially
 866    outside the private segment)
 867    wg->alloca_frame_p points to the first address _outside_ the current
 868    function's allocations (initially to the same as alloca_stack_p)
 869
 870    The data is allocated downwards from the end of the private segment.
 871
 872    In the beginning of a new function which has allocas, a new alloca
 873    frame is pushed which adds the current alloca_frame_p (the current
 874    function's frame starting point) to the top of the alloca stack and
 875    alloca_frame_p is set to the current stack position.
 876
 877    At the exit points of a function with allocas, the alloca frame
 878    is popped before returning.  This involves popping the alloca_frame_p
 879    to the one of the previous function in the call stack, and alloca_stack_p
 880    similarly, to the position of the last word alloca'd by the previous
 881    function.
 882  */
 883
 884 uint32_t
 885 __hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
 886 {
 887   volatile PHSAWorkGroup *wg = wi->wg;
 888   uint32_t new_pos = wg->alloca_stack_p - size;
 889   while (new_pos % align != 0)
 890     new_pos--;
 891   wg->alloca_stack_p = new_pos;
 892
 893 #ifdef DEBUG_ALLOCA
 894   printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
 895           wg->alloca_stack_p, wg->alloca_frame_p);
 896 #endif
 897   return new_pos;
 898 }
 899
 900 /* Initializes a new "alloca frame" in the private segment.
 901    This should be called at all the function entry points in case
 902    the function contains at least one call to alloca.  */
 903
 904 void
 905 __hsail_alloca_push_frame (PHSAWorkItem *wi)
 906 {
 907   volatile PHSAWorkGroup *wg = wi->wg;
 908
 909   /* Store the alloca_frame_p without any alignment padding so
 910      we know exactly where the previous frame ended after popping
 911      it.  */
 912 #ifdef DEBUG_ALLOCA
 913   printf ("--- push frame ");
 914 #endif
 915   uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
 916   memcpy (wg->private_base_ptr + last_word_offs,
 917           (const void *) &wg->alloca_frame_p, 4);
 918   wg->alloca_frame_p = last_word_offs;
 919
 920 #ifdef DEBUG_ALLOCA
 921   printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
 922 #endif
 923 }
 924
 925 /* Frees the current "alloca frame" and restores the frame
 926    pointer.
 927    This should be called at all the function return points in case
 928    the function contains at least one call to alloca.  Restores the
 929    alloca stack to the condition it was before pushing the frame
 930    the last time.  */
 931 void
 932 __hsail_alloca_pop_frame (PHSAWorkItem *wi)
 933 {
 934   volatile PHSAWorkGroup *wg = wi->wg;
 935
 936   wg->alloca_stack_p = wg->alloca_frame_p;
 937   memcpy ((void *) &wg->alloca_frame_p,
 938           (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
 939   /* Now frame_p points to the beginning of the previous function's
 940      frame and stack_p to its end.  */
 941
 942   wg->alloca_stack_p += 4;
 943
 944 #ifdef DEBUG_ALLOCA
 945   printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
 946           wg->alloca_frame_p);
 947 #endif
 948 }