libhsail-rt/rt/workitems.c

   1 /* workitems.c -- The main runtime entry that performs work-item execution in
   2    various ways and the builtin functions closely related to the
   3    implementation.
   4
   5    Copyright (C) 2015-2018 Free Software Foundation, Inc.
   6    Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
   7    for General Processor Tech.
   8
   9    Permission is hereby granted, free of charge, to any person obtaining a
  10    copy of this software and associated documentation files
  11    (the "Software"), to deal in the Software without restriction, including
  12    without limitation the rights to use, copy, modify, merge, publish,
  13    distribute, sublicense, and/or sell copies of the Software, and to
  14    permit persons to whom the Software is furnished to do so, subject to
  15    the following conditions:
  16
  17    The above copyright notice and this permission notice shall be included
  18    in all copies or substantial portions of the Software.
  19
  20    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  21    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  23    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
  24    DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  25    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  26    USE OR OTHER DEALINGS IN THE SOFTWARE.
  27 */
  28
  29 /* The fiber based multiple work-item work-group execution uses ucontext
  30    based user mode threading.  However, if gccbrig is able to optimize the
  31    kernel to a much faster work-group function that implements the multiple
  32    WI execution using loops instead of fibers requiring slow context switches,
  33    the fiber-based implementation won't be called.
  34  */
  35
  36 #include <stdlib.h>
  37 #include <signal.h>
  38 #include <string.h>
  39
  40 #include "workitems.h"
  41 #include "phsa-rt.h"
  42
  43 #ifdef HAVE_FIBERS
  44 #include "fibers.h"
  45 #endif
  46
  47 #ifdef BENCHMARK_PHSA_RT
  48 #include <stdio.h>
  49 #include <time.h>
  50
  51 static uint64_t wi_count = 0;
  52 static uint64_t wis_skipped = 0;
  53 static uint64_t wi_total = 0;
  54 static clock_t start_time;
  55
  56 #endif
  57
  58 #ifdef DEBUG_PHSA_RT
  59 #include <stdio.h>
  60 #endif
  61
  62 #define PRIVATE_SEGMENT_ALIGN 256
  63 #define FIBER_STACK_SIZE (64*1024)
  64 #define GROUP_SEGMENT_ALIGN 256
  65
  66 uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
  67
  68 uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
  69
  70 uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
  71
  72 uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
  73
  74 uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
  75
  76 void
  77 phsa_fatal_error (int code)
  78 {
  79   exit (code);
  80 }
  81
  82 #ifdef HAVE_FIBERS
  83 /* ucontext-based work-item thread implementation.  Runs all work-items in
  84    separate fibers.  */
  85
  86 static void
  87 phsa_work_item_thread (int arg0, int arg1)
  88 {
  89   void *arg = fiber_int_args_to_ptr (arg0, arg1);
  90
  91   PHSAWorkItem *wi = (PHSAWorkItem *) arg;
  92   volatile PHSAWorkGroup *wg = wi->wg;
  93   PHSAKernelLaunchData *l_data = wi->launch_data;
  94
  95   do
  96     {
  97       int retcode
  98         = fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);
  99
 100       /* At this point the threads can assume that either more_wgs is 0 or
 101          the current_work_group_* is set to point to the WG executed next.  */
 102       if (!wi->wg->more_wgs)
 103         break;
 104 #ifdef DEBUG_PHSA_RT
 105       printf (
 106         "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
 107         wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x,
 108         l_data->wg_max_y, l_data->wg_max_z);
 109 #endif
 110
 111       if (wi->x < __hsail_currentworkgroupsize (0, wi)
 112           && wi->y < __hsail_currentworkgroupsize (1, wi)
 113           && wi->z < __hsail_currentworkgroupsize (2, wi))
 114         {
 115           l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
 116                           wg->initial_group_offset, wg->private_base_ptr);
 117 #ifdef DEBUG_PHSA_RT
 118           printf ("done.\n");
 119 #endif
 120 #ifdef BENCHMARK_PHSA_RT
 121           wi_count++;
 122 #endif
 123         }
 124       else
 125         {
 126 #ifdef DEBUG_PHSA_RT
 127           printf ("skipped (partial WG).\n");
 128 #endif
 129 #ifdef BENCHMARK_PHSA_RT
 130           wis_skipped++;
 131 #endif
 132         }
 133
 134       retcode
 135         = fiber_barrier_reach ((fiber_barrier_t *)
 136                                l_data->wg_completion_barrier);
 137
 138       /* The first thread updates the WG to execute next etc.  */
 139
 140       if (retcode == 0)
 141         {
 142 #ifdef EXECUTE_WGS_BACKWARDS
 143           if (wg->x == l_data->wg_min_x)
 144             {
 145               wg->x = l_data->wg_max_x - 1;
 146               if (wg->y == l_data->wg_min_y)
 147                 {
 148                   wg->y = l_data->wg_max_y - 1;
 149                   if (wg->z == l_data->wg_min_z)
 150                     wg->more_wgs = 0;
 151                   else
 152                     wg->z--;
 153                 }
 154               else
 155                 wg->y--;
 156             }
 157           else
 158             wg->x--;
 159 #else
 160           if (wg->x + 1 >= l_data->wg_max_x)
 161             {
 162               wg->x = l_data->wg_min_x;
 163               if (wg->y + 1 >= l_data->wg_max_y)
 164                 {
 165                   wg->y = l_data->wg_min_y;
 166                   if (wg->z + 1 >= l_data->wg_max_z)
 167                     wg->more_wgs = 0;
 168                   else
 169                     wg->z++;
 170                 }
 171               else
 172                 wg->y++;
 173             }
 174           else
 175             wg->x++;
 176 #endif
 177
 178           /* Reinitialize the work-group barrier according to the new WG's
 179              size, which might not be the same as the previous ones, due
 180              to "partial WGs".  */
 181           size_t wg_size = __hsail_currentworkgroupsize (0, wi)
 182                            * __hsail_currentworkgroupsize (1, wi)
 183                            * __hsail_currentworkgroupsize (2, wi);
 184
 185 #ifdef DEBUG_PHSA_RT
 186           printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
 187 #endif
 188           fiber_barrier_init ((fiber_barrier_t *)
 189                               wi->launch_data->wg_sync_barrier,
 190                               wg_size);
 191
 192 #ifdef BENCHMARK_PHSA_RT
 193           if (wi_count % 1000 == 0)
 194             {
 195               clock_t spent_time = clock () - start_time;
 196               double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
 197               double wis_per_sec = wi_count / spent_time_sec;
 198               uint64_t eta_sec
 199                 = (wi_total - wi_count - wis_skipped) / wis_per_sec;
 200
 201               printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
 202                       "%lu s)\n",
 203                       wi_count, wis_skipped, (uint64_t) spent_time_sec,
 204                       (uint64_t) wis_per_sec, (uint64_t) eta_sec);
 205             }
 206 #endif
 207         }
 208     }
 209   while (1);
 210
 211   fiber_exit ();
 212 }
 213 #endif
 214
 215 #define MIN(a, b) ((a < b) ? a : b)
 216 #define MAX(a, b) ((a > b) ? a : b)
 217
 218 #ifdef HAVE_FIBERS
 219 /* Spawns a given number of work-items to execute a set of work-groups,
 220    blocks until their completion.  */
 221
 222 static void
 223 phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
 224                       uint32_t group_local_offset, size_t wg_size_x,
 225                       size_t wg_size_y, size_t wg_size_z)
 226 {
 227   PHSAWorkItem *wi_threads = NULL;
 228   PHSAWorkGroup wg;
 229   size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
 230   fiber_barrier_t wg_start_barrier;
 231   fiber_barrier_t wg_completion_barrier;
 232   fiber_barrier_t wg_sync_barrier;
 233
 234   max_x = wg_size_x == 0 ? 1 : wg_size_x;
 235   max_y = wg_size_y == 0 ? 1 : wg_size_y;
 236   max_z = wg_size_z == 0 ? 1 : wg_size_z;
 237
 238   size_t wg_size = max_x * max_y * max_z;
 239   if (wg_size > PHSA_MAX_WG_SIZE)
 240     phsa_fatal_error (2);
 241
 242   wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
 243   if (wg.private_segment_total_size > 0
 244       && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
 245                          wg.private_segment_total_size)
 246            != 0)
 247     phsa_fatal_error (3);
 248
 249   wg.alloca_stack_p = wg.private_segment_total_size;
 250   wg.alloca_frame_p = wg.alloca_stack_p;
 251   wg.initial_group_offset = group_local_offset;
 252
 253 #ifdef EXECUTE_WGS_BACKWARDS
 254   wg.x = context->wg_max_x - 1;
 255   wg.y = context->wg_max_y - 1;
 256   wg.z = context->wg_max_z - 1;
 257 #else
 258   wg.x = context->wg_min_x;
 259   wg.y = context->wg_min_y;
 260   wg.z = context->wg_min_z;
 261 #endif
 262
 263   fiber_barrier_init (&wg_sync_barrier, wg_size);
 264   fiber_barrier_init (&wg_start_barrier, wg_size);
 265   fiber_barrier_init (&wg_completion_barrier, wg_size);
 266
 267   context->wg_start_barrier = &wg_start_barrier;
 268   context->wg_sync_barrier = &wg_sync_barrier;
 269   context->wg_completion_barrier = &wg_completion_barrier;
 270
 271   wg.more_wgs = 1;
 272   wg.group_base_ptr = group_base_ptr;
 273
 274 #ifdef BENCHMARK_PHSA_RT
 275   wi_count = 0;
 276   wis_skipped = 0;
 277   start_time = clock ();
 278 #endif
 279   wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
 280   for (x = 0; x < max_x; ++x)
 281     for (y = 0; y < max_y; ++y)
 282       for (z = 0; z < max_z; ++z)
 283         {
 284           PHSAWorkItem *wi = &wi_threads[flat_wi_id];
 285           wi->launch_data = context;
 286           wi->wg = &wg;
 287           wi->x = x;
 288           wi->y = y;
 289           wi->z = z;
 290
 291           /* TODO: set the stack size according to the private
 292                    segment size.  Too big stack consumes huge amount of
 293                    memory in case of huge number of WIs and a too small stack
 294                    will fail in mysterious and potentially dangerous ways.  */
 295
 296           fiber_init (&wi->fiber, phsa_work_item_thread, wi,
 297                       FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
 298           ++flat_wi_id;
 299         }
 300
 301   do
 302     {
 303       --flat_wi_id;
 304       fiber_join (&wi_threads[flat_wi_id].fiber);
 305     }
 306   while (flat_wi_id > 0);
 307
 308   if (wg.private_segment_total_size > 0)
 309     free (wg.private_base_ptr);
 310
 311   free (wi_threads);
 312 }
 313
 314 /* Spawn the work-item threads to execute work-groups and let
 315    them execute all the WGs, including a potential partial WG.  */
 316
 317 static void
 318 phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr,
 319                        uint32_t group_local_offset)
 320 {
 321   hsa_kernel_dispatch_packet_t *dp = context->dp;
 322   size_t x, y, z;
 323
 324   context->group_segment_start_addr = (size_t) group_base_ptr;
 325
 326   /* HSA seems to allow the WG size to be larger than the grid size.  We need to
 327      saturate the effective WG size to the grid size to prevent the extra WIs
 328      from executing.  */
 329   size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
 330   sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
 331   sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
 332   sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
 333   sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
 334
 335 #ifdef BENCHMARK_PHSA_RT
 336   wi_total = (uint64_t) dp->grid_size_x
 337              * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
 338              * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
 339 #endif
 340
 341   /* For now execute all work groups in a single coarse thread (does not utilize
 342      multicore/multithread).  */
 343   context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
 344
 345   int dims = dp->setup & 0x3;
 346
 347   context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
 348                       / dp->workgroup_size_x;
 349
 350   context->wg_max_y
 351     = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
 352                        / dp->workgroup_size_y;
 353
 354   context->wg_max_z
 355     = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
 356                        / dp->workgroup_size_z;
 357
 358 #ifdef DEBUG_PHSA_RT
 359   printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
 360           "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
 361           context->wg_min_x, context->wg_min_y, context->wg_min_z,
 362           context->wg_max_x, context->wg_max_y, context->wg_max_z,
 363           sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
 364           dp->grid_size_y, dp->grid_size_z);
 365 #endif
 366
 367   phsa_execute_wi_gang (context, group_base_ptr, group_local_offset,
 368                         sat_wg_size_x, sat_wg_size_y, sat_wg_size_z);
 369 }
 370 #endif
 371
 372 /* Executes the given work-group function for all work groups in the grid.
 373
 374    A work-group function is a version of the original kernel which executes
 375    the kernel for all work-items in a work-group.  It is produced by gccbrig
 376    if it can handle the kernel's barrier usage and is much faster way to
 377    execute massive numbers of work-items in a non-SPMD machine than fibers
 378    (easily 100x faster).  */
 379 static void
 380 phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
 381                           uint32_t group_local_offset)
 382 {
 383   hsa_kernel_dispatch_packet_t *dp = context->dp;
 384   size_t x, y, z, wg_x, wg_y, wg_z;
 385
 386   context->group_segment_start_addr = (size_t) group_base_ptr;
 387
 388   /* HSA seems to allow the WG size to be larger than the grid size.  We need
 389      to saturate the effective WG size to the grid size to prevent the extra WIs
 390      from executing.  */
 391   size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
 392   sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
 393   sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
 394   sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
 395   sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
 396
 397 #ifdef BENCHMARK_PHSA_RT
 398   wi_total = (uint64_t) dp->grid_size_x
 399              * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
 400              * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
 401 #endif
 402
 403   context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
 404
 405   int dims = dp->setup & 0x3;
 406
 407   context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
 408                       / dp->workgroup_size_x;
 409
 410   context->wg_max_y
 411     = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
 412                        / dp->workgroup_size_y;
 413
 414   context->wg_max_z
 415     = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
 416                        / dp->workgroup_size_z;
 417
 418 #ifdef DEBUG_PHSA_RT
 419   printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
 420           "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
 421           context->wg_min_x, context->wg_min_y, context->wg_min_z,
 422           context->wg_max_x, context->wg_max_y, context->wg_max_z,
 423           sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
 424           dp->grid_size_y, dp->grid_size_z);
 425 #endif
 426
 427   PHSAWorkItem wi;
 428   PHSAWorkGroup wg;
 429   wi.wg = &wg;
 430   wi.x = wi.y = wi.z = 0;
 431   wi.launch_data = context;
 432
 433 #ifdef BENCHMARK_PHSA_RT
 434   start_time = clock ();
 435   uint64_t wg_count = 0;
 436 #endif
 437
 438   size_t wg_size = __hsail_workgroupsize (0, &wi)
 439                    * __hsail_workgroupsize (1, &wi)
 440                    * __hsail_workgroupsize (2, &wi);
 441
 442   void *private_base_ptr = NULL;
 443   if (dp->private_segment_size > 0
 444       && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
 445                          dp->private_segment_size * wg_size)
 446            != 0)
 447     phsa_fatal_error (3);
 448
 449   wg.alloca_stack_p = dp->private_segment_size * wg_size;
 450   wg.alloca_frame_p = wg.alloca_stack_p;
 451
 452   wg.private_base_ptr = private_base_ptr;
 453   wg.group_base_ptr = group_base_ptr;
 454
 455 #ifdef DEBUG_PHSA_RT
 456   printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
 457           wg_size, private_base_ptr);
 458 #endif
 459
 460   for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
 461     for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
 462       for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
 463         {
 464           wi.wg->x = wg_x;
 465           wi.wg->y = wg_y;
 466           wi.wg->z = wg_z;
 467
 468           context->kernel (context->kernarg_addr, &wi, group_base_ptr,
 469                            group_local_offset, private_base_ptr);
 470
 471 #if defined (BENCHMARK_PHSA_RT)
 472           wg_count++;
 473           if (wg_count % 1000000 == 0)
 474             {
 475               clock_t spent_time = clock () - start_time;
 476               uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
 477                                   + wg_z * sat_wg_size_z;
 478               double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
 479               double wis_per_sec = wi_count / spent_time_sec;
 480               uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
 481
 482               printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
 483                       wi_count, (uint64_t) spent_time_sec,
 484                       (uint64_t) wis_per_sec, (uint64_t) eta_sec);
 485             }
 486 #endif
 487         }
 488
 489 #ifdef BENCHMARK_PHSA_RT
 490   clock_t spent_time = clock () - start_time;
 491   double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
 492   double wis_per_sec = wi_total / spent_time_sec;
 493
 494   printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
 495           (uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
 496 #endif
 497   free (private_base_ptr);
 498   private_base_ptr = NULL;
 499 }
 500
 501 /* gccbrig generates the following from each HSAIL kernel:
 502
 503    1) The actual kernel function (a single work-item kernel or a work-group
 504       function) generated from HSAIL (BRIG).
 505
 506          static void _Kernel (void* args, void* context, void* group_base_ptr)
 507          {
 508            ...
 509          }
 510
 511   2) A public facing kernel function that is called from the PHSA runtime:
 512
 513    a) A single work-item function (that requires fibers for multi-WI):
 514
 515       void Kernel (void* context)
 516       {
  517          __hsail_launch_kernel (_Kernel, context);
 518       }
 519
 520       or
 521
  522     b) A work-group function, when gccbrig could generate one:
 523
 524       void Kernel (void* context)
 525       {
 526                 __hsail_launch_wg_function (_Kernel, context);
 527       }
 528 */
 529
 530 #ifdef HAVE_FIBERS
 531
 532 void
 533 __hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
 534                        void *group_base_ptr, uint32_t group_local_offset)
 535 {
 536   context->kernel = kernel;
 537   phsa_spawn_work_items (context, group_base_ptr, group_local_offset);
 538 }
 539 #endif
 540
 541 void
 542 __hsail_launch_wg_function (gccbrigKernelFunc kernel,
 543                             PHSAKernelLaunchData *context, void *group_base_ptr,
 544                             uint32_t group_local_offset)
 545 {
 546   context->kernel = kernel;
 547   phsa_execute_work_groups (context, group_base_ptr, group_local_offset);
 548 }
 549
 550 uint32_t
 551 __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
 552 {
 553   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 554
 555   uint32_t id;
 556   switch (dim)
 557     {
 558     default:
 559     case 0:
 560       /* Overflow semantics in the case of WG dim > grid dim.  */
 561       id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
 562            % dp->grid_size_x;
 563       break;
 564     case 1:
 565       id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
 566            % dp->grid_size_y;
 567       break;
 568     case 2:
 569       id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
 570            % dp->grid_size_z;
 571       break;
 572     }
 573   return id;
 574 }
 575
 576 uint64_t
 577 __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
 578 {
 579   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 580
 581   uint64_t id;
 582   switch (dim)
 583     {
 584     default:
 585     case 0:
 586       /* Overflow semantics in the case of WG dim > grid dim.  */
 587       id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
 588            % dp->grid_size_x;
 589       break;
 590     case 1:
 591       id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
 592            % dp->grid_size_y;
 593       break;
 594     case 2:
 595       id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
 596            % dp->grid_size_z;
 597       break;
 598     }
 599   return id;
 600 }
 601
 602
 603 uint32_t
 604 __hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
 605 {
 606   PHSAWorkItem *c = (PHSAWorkItem *) context;
 607   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 608
 609   /* The number of dimensions is in the two least significant bits.  */
 610   int dims = dp->setup & 0x3;
 611
 612   uint32_t id;
 613   switch (dim)
 614     {
 615     default:
 616     case 0:
 617       id = c->x;
 618       break;
 619     case 1:
 620       id = dims < 2 ? 0 : c->y;
 621       break;
 622     case 2:
 623       id = dims < 3 ? 0 : c->z;
 624       break;
 625     }
 626   return id;
 627 }
 628
 629 uint32_t
 630 __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
 631 {
 632   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 633   int dims = dp->setup & 0x3;
 634
 635   uint32_t id;
 636   switch (dim)
 637     {
 638     default:
 639     case 0:
 640       id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
 641       break;
 642     case 1:
 643       id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
 644                             / dp->workgroup_size_y;
 645       break;
 646     case 2:
 647       id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
 648                             / dp->workgroup_size_z;
 649       break;
 650     }
 651   return id;
 652 }
 653
 654 uint32_t
 655 __hsail_workitemflatid (PHSAWorkItem *c)
 656 {
 657   hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
 658
 659   return c->x + c->y * dp->workgroup_size_x
 660          + c->z * dp->workgroup_size_x * dp->workgroup_size_y;
 661 }
 662
 663 uint32_t
 664 __hsail_currentworkitemflatid (PHSAWorkItem *c)
 665 {
 666   hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
 667
 668   return c->x + c->y * __hsail_currentworkgroupsize (0, c)
 669          + c->z * __hsail_currentworkgroupsize (0, c)
 670              * __hsail_currentworkgroupsize (1, c);
 671 }
 672
 673 void
 674 __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
 675 {
 676   switch (dim)
 677     {
 678     default:
 679     case 0:
 680       context->x = id;
 681       break;
 682     case 1:
 683       context->y = id;
 684       break;
 685     case 2:
 686       context->z = id;
 687       break;
 688     }
 689 }
 690
 691 uint64_t
 692 __hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
 693 {
 694   PHSAWorkItem *c = (PHSAWorkItem *) context;
 695   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 696
 697   /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1.  */
 698   uint64_t id0 = __hsail_workitemabsid (0, context);
 699   uint64_t id1 = __hsail_workitemabsid (1, context);
 700   uint64_t id2 = __hsail_workitemabsid (2, context);
 701
 702   uint64_t max0 = dp->grid_size_x;
 703   uint64_t max1 = dp->grid_size_y;
 704   uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
 705
 706   return id;
 707 }
 708
 709 uint32_t
 710 __hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
 711 {
 712   PHSAWorkItem *c = (PHSAWorkItem *) context;
 713   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 714
 715   /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1.  */
 716   uint64_t id0 = __hsail_workitemabsid (0, context);
 717   uint64_t id1 = __hsail_workitemabsid (1, context);
 718   uint64_t id2 = __hsail_workitemabsid (2, context);
 719
 720   uint64_t max0 = dp->grid_size_x;
 721   uint64_t max1 = dp->grid_size_y;
 722   uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
 723   return (uint32_t) id;
 724 }
 725
 726 uint32_t
 727 __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
 728 {
 729   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 730   uint32_t wg_size = 0;
 731   switch (dim)
 732     {
 733     default:
 734     case 0:
 735       if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x)
 736         wg_size = dp->workgroup_size_x; /* Full WG.  */
 737       else
 738         wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG.  */
 739       break;
 740     case 1:
 741       if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y)
 742         wg_size = dp->workgroup_size_y; /* Full WG.  */
 743       else
 744         wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG.  */
 745       break;
 746     case 2:
 747       if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z)
 748         wg_size = dp->workgroup_size_z; /* Full WG.  */
 749       else
 750         wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG.  */
 751       break;
 752     }
 753   return wg_size;
 754 }
 755
 756 uint32_t
 757 __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
 758 {
 759   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 760   switch (dim)
 761     {
 762     default:
 763     case 0:
 764       return dp->workgroup_size_x;
 765     case 1:
 766       return dp->workgroup_size_y;
 767     case 2:
 768       return dp->workgroup_size_z;
 769     }
 770 }
 771
 772 uint32_t
 773 __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
 774 {
 775   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 776   switch (dim)
 777     {
 778     default:
 779     case 0:
 780       return dp->grid_size_x;
 781     case 1:
 782       return dp->grid_size_y;
 783     case 2:
 784       return dp->grid_size_z;
 785     }
 786 }
 787
 788 uint32_t
 789 __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
 790 {
 791   switch (dim)
 792     {
 793     default:
 794     case 0:
 795       return wi->wg->x;
 796     case 1:
 797       return wi->wg->y;
 798     case 2:
 799       return wi->wg->z;
 800     }
 801 }
 802
 803 uint32_t
 804 __hsail_dim (PHSAWorkItem *wi)
 805 {
 806   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 807   return dp->setup & 0x3;
 808 }
 809
 810 uint64_t
 811 __hsail_packetid (PHSAWorkItem *wi)
 812 {
 813   return wi->launch_data->packet_id;
 814 }
 815
 816 uint32_t
 817 __hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
 818 {
 819   return (uint32_t) wi->launch_data->dp->completion_signal.handle;
 820 }
 821
 822 uint64_t
 823 __hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
 824 {
 825   return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
 826 }
 827
 828 #ifdef HAVE_FIBERS
 829 void
 830 __hsail_barrier (PHSAWorkItem *wi)
 831 {
 832   fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
 833 }
 834 #endif
 835
 836 /* Return a 32b private segment address that points to a dynamically
 837    allocated chunk of 'size' with 'align'.
 838
 839    Allocates the space from the end of the private segment allocated
 840    for the whole work group.  In implementations with separate private
 841    memories per WI, we will need to have a stack pointer per WI.  But in
 842    the current implementation, the segment is shared, so we possibly
 843    save some space in case all WIs do not call the alloca.
 844
 845    The "alloca frames" are organized as follows:
 846
 847    wg->alloca_stack_p points to the last allocated data (initially
 848    outside the private segment)
 849    wg->alloca_frame_p points to the first address _outside_ the current
 850    function's allocations (initially to the same as alloca_stack_p)
 851
 852    The data is allocated downwards from the end of the private segment.
 853
 854    In the beginning of a new function which has allocas, a new alloca
 855    frame is pushed which adds the current alloca_frame_p (the current
 856    function's frame starting point) to the top of the alloca stack and
 857    alloca_frame_p is set to the current stack position.
 858
 859    At the exit points of a function with allocas, the alloca frame
 860    is popped before returning.  This involves popping the alloca_frame_p
 861    to the one of the previous function in the call stack, and alloca_stack_p
 862    similarly, to the position of the last word alloca'd by the previous
 863    function.
 864  */
 865
 866 uint32_t
 867 __hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
 868 {
 869   volatile PHSAWorkGroup *wg = wi->wg;
 870   uint32_t new_pos = wg->alloca_stack_p - size;
 871   while (new_pos % align != 0)
 872     new_pos--;
 873   wg->alloca_stack_p = new_pos;
 874
 875 #ifdef DEBUG_ALLOCA
 876   printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
 877           wg->alloca_stack_p, wg->alloca_frame_p);
 878 #endif
 879   return new_pos;
 880 }
 881
 882 /* Initializes a new "alloca frame" in the private segment.
 883    This should be called at all the function entry points in case
 884    the function contains at least one call to alloca.  */
 885
 886 void
 887 __hsail_alloca_push_frame (PHSAWorkItem *wi)
 888 {
 889   volatile PHSAWorkGroup *wg = wi->wg;
 890
 891   /* Store the alloca_frame_p without any alignment padding so
 892      we know exactly where the previous frame ended after popping
 893      it.  */
 894 #ifdef DEBUG_ALLOCA
 895   printf ("--- push frame ");
 896 #endif
 897   uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
 898   memcpy (wg->private_base_ptr + last_word_offs,
 899           (const void *) &wg->alloca_frame_p, 4);
 900   wg->alloca_frame_p = last_word_offs;
 901
 902 #ifdef DEBUG_ALLOCA
 903   printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
 904 #endif
 905 }
 906
 907 /* Frees the current "alloca frame" and restores the frame
 908    pointer.
 909    This should be called at all the function return points in case
 910    the function contains at least one call to alloca.  Restores the
 911    alloca stack to the condition it was before pushing the frame
 912    the last time.  */
 913 void
 914 __hsail_alloca_pop_frame (PHSAWorkItem *wi)
 915 {
 916   volatile PHSAWorkGroup *wg = wi->wg;
 917
 918   wg->alloca_stack_p = wg->alloca_frame_p;
 919   memcpy ((void *) &wg->alloca_frame_p,
 920           (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
 921   /* Now frame_p points to the beginning of the previous function's
 922      frame and stack_p to its end.  */
 923
 924   wg->alloca_stack_p += 4;
 925
 926 #ifdef DEBUG_ALLOCA
 927   printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
 928           wg->alloca_frame_p);
 929 #endif
 930 }