1 /* workitems.c -- The main runtime entry that performs work-item execution in
2 various ways and the builtin functions closely related to the
5 Copyright (C) 2015-2017 Free Software Foundation, Inc.
6 Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
7 for General Processor Tech.
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files
11 (the "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be included
18 in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
24 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
25 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
26 USE OR OTHER DEALINGS IN THE SOFTWARE.  */
29 /* The fiber based multiple work-item work-group execution uses ucontext
30 based user mode threading. However, if gccbrig is able to optimize the
31 kernel to a much faster work-group function that implements the multiple
32 WI execution using loops instead of fibers requiring slow context switches,
33 the fiber-based implementation won't be called.  */
40 #include "workitems.h"
47 #ifdef BENCHMARK_PHSA_RT
51 static uint64_t wi_count
= 0;
52 static uint64_t wis_skipped
= 0;
53 static uint64_t wi_total
= 0;
54 static clock_t start_time
;
62 #define PRIVATE_SEGMENT_ALIGN 256
63 #define FIBER_STACK_SIZE (64*1024)
64 #define GROUP_SEGMENT_ALIGN 256
66 /* HSA requires WGs to be executed in flat work-group id order. Enabling
67 the following macro can reveal test cases that rely on the ordering,
68 but is not useful for much else. */
70 uint32_t __hsail_workitemabsid (uint32_t dim
, PHSAWorkItem
*context
);
72 uint32_t __hsail_workitemid (uint32_t dim
, PHSAWorkItem
*context
);
74 uint32_t __hsail_gridgroups (uint32_t dim
, PHSAWorkItem
*context
);
76 uint32_t __hsail_currentworkgroupsize (uint32_t dim
, PHSAWorkItem
*wi
);
78 uint32_t __hsail_workgroupsize (uint32_t dim
, PHSAWorkItem
*wi
);
81 phsa_fatal_error (int code
)
87 /* ucontext-based work-item thread implementation. Runs all work-items in
91 phsa_work_item_thread (int arg0
, int arg1
)
93 void *arg
= fiber_int_args_to_ptr (arg0
, arg1
);
95 PHSAWorkItem
*wi
= (PHSAWorkItem
*) arg
;
96 volatile PHSAWorkGroup
*wg
= wi
->wg
;
97 PHSAKernelLaunchData
*l_data
= wi
->launch_data
;
102 = fiber_barrier_reach ((fiber_barrier_t
*) l_data
->wg_start_barrier
);
104 /* At this point the threads can assume that either more_wgs is 0 or
105 the current_work_group_* is set to point to the WG executed next. */
106 if (!wi
->wg
->more_wgs
)
110 "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
111 wi
->x
, wi
->y
, wi
->z
, wg
->x
, wg
->y
, wg
->z
, l_data
->wg_max_x
,
112 l_data
->wg_max_y
, l_data
->wg_max_z
);
115 if (wi
->x
< __hsail_currentworkgroupsize (0, wi
)
116 && wi
->y
< __hsail_currentworkgroupsize (1, wi
)
117 && wi
->z
< __hsail_currentworkgroupsize (2, wi
))
119 l_data
->kernel (l_data
->kernarg_addr
, wi
, wg
->group_base_ptr
,
120 wg
->private_base_ptr
);
124 #ifdef BENCHMARK_PHSA_RT
131 printf ("skipped (partial WG).\n");
133 #ifdef BENCHMARK_PHSA_RT
139 = fiber_barrier_reach ((fiber_barrier_t
*)
140 l_data
->wg_completion_barrier
);
142 /* The first thread updates the WG to execute next etc. */
146 #ifdef EXECUTE_WGS_BACKWARDS
147 if (wg
->x
== l_data
->wg_min_x
)
149 wg
->x
= l_data
->wg_max_x
- 1;
150 if (wg
->y
== l_data
->wg_min_y
)
152 wg
->y
= l_data
->wg_max_y
- 1;
153 if (wg
->z
== l_data
->wg_min_z
)
164 if (wg
->x
+ 1 >= l_data
->wg_max_x
)
166 wg
->x
= l_data
->wg_min_x
;
167 if (wg
->y
+ 1 >= l_data
->wg_max_y
)
169 wg
->y
= l_data
->wg_min_y
;
170 if (wg
->z
+ 1 >= l_data
->wg_max_z
)
182 /* Reinitialize the work-group barrier according to the new WG's
183 size, which might not be the same as the previous ones, due
185 size_t wg_size
= __hsail_currentworkgroupsize (0, wi
)
186 * __hsail_currentworkgroupsize (1, wi
)
187 * __hsail_currentworkgroupsize (2, wi
);
190 printf ("Reinitializing the WG barrier to %lu.\n", wg_size
);
192 fiber_barrier_init ((fiber_barrier_t
*)
193 wi
->launch_data
->wg_sync_barrier
,
196 #ifdef BENCHMARK_PHSA_RT
197 if (wi_count
% 1000 == 0)
199 clock_t spent_time
= clock () - start_time
;
200 double spent_time_sec
= (double) spent_time
/ CLOCKS_PER_SEC
;
201 double wis_per_sec
= wi_count
/ spent_time_sec
;
203 = (wi_total
- wi_count
- wis_skipped
) / wis_per_sec
;
205 printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
207 wi_count
, wis_skipped
, (uint64_t) spent_time_sec
,
208 (uint64_t) wis_per_sec
, (uint64_t) eta_sec
);
219 #define MIN(a, b) ((a < b) ? a : b)
220 #define MAX(a, b) ((a > b) ? a : b)
223 /* Spawns a given number of work-items to execute a set of work-groups,
224 blocks until their completion. */
227 phsa_execute_wi_gang (PHSAKernelLaunchData
*context
, void *group_base_ptr
,
228 size_t wg_size_x
, size_t wg_size_y
, size_t wg_size_z
)
230 PHSAWorkItem
*wi_threads
= NULL
;
232 size_t flat_wi_id
= 0, x
, y
, z
, max_x
, max_y
, max_z
;
233 fiber_barrier_t wg_start_barrier
;
234 fiber_barrier_t wg_completion_barrier
;
235 fiber_barrier_t wg_sync_barrier
;
237 max_x
= wg_size_x
== 0 ? 1 : wg_size_x
;
238 max_y
= wg_size_y
== 0 ? 1 : wg_size_y
;
239 max_z
= wg_size_z
== 0 ? 1 : wg_size_z
;
241 size_t wg_size
= max_x
* max_y
* max_z
;
242 if (wg_size
> PHSA_MAX_WG_SIZE
)
243 phsa_fatal_error (2);
245 wg
.private_segment_total_size
= context
->dp
->private_segment_size
* wg_size
;
246 if (wg
.private_segment_total_size
> 0
247 && posix_memalign (&wg
.private_base_ptr
, PRIVATE_SEGMENT_ALIGN
,
248 wg
.private_segment_total_size
)
250 phsa_fatal_error (3);
252 wg
.alloca_stack_p
= wg
.private_segment_total_size
;
253 wg
.alloca_frame_p
= wg
.alloca_stack_p
;
255 #ifdef EXECUTE_WGS_BACKWARDS
256 wg
.x
= context
->wg_max_x
- 1;
257 wg
.y
= context
->wg_max_y
- 1;
258 wg
.z
= context
->wg_max_z
- 1;
260 wg
.x
= context
->wg_min_x
;
261 wg
.y
= context
->wg_min_y
;
262 wg
.z
= context
->wg_min_z
;
265 fiber_barrier_init (&wg_sync_barrier
, wg_size
);
266 fiber_barrier_init (&wg_start_barrier
, wg_size
);
267 fiber_barrier_init (&wg_completion_barrier
, wg_size
);
269 context
->wg_start_barrier
= &wg_start_barrier
;
270 context
->wg_sync_barrier
= &wg_sync_barrier
;
271 context
->wg_completion_barrier
= &wg_completion_barrier
;
274 wg
.group_base_ptr
= group_base_ptr
;
276 #ifdef BENCHMARK_PHSA_RT
279 start_time
= clock ();
281 wi_threads
= malloc (sizeof (PHSAWorkItem
) * max_x
* max_y
* max_z
);
282 for (x
= 0; x
< max_x
; ++x
)
283 for (y
= 0; y
< max_y
; ++y
)
284 for (z
= 0; z
< max_z
; ++z
)
286 PHSAWorkItem
*wi
= &wi_threads
[flat_wi_id
];
287 wi
->launch_data
= context
;
293 /* TODO: set the stack size according to the private
294 segment size. Too big stack consumes huge amount of
295 memory in case of huge number of WIs and a too small stack
296 will fail in mysterious and potentially dangerous ways. */
298 fiber_init (&wi
->fiber
, phsa_work_item_thread
, wi
,
299 FIBER_STACK_SIZE
, PRIVATE_SEGMENT_ALIGN
);
306 fiber_join (&wi_threads
[flat_wi_id
].fiber
);
308 while (flat_wi_id
> 0);
310 if (wg
.private_segment_total_size
> 0)
311 free (wg
.private_base_ptr
);
316 /* Spawn the work-item threads to execute work-groups and let
317 them execute all the WGs, including a potential partial WG. */
320 phsa_spawn_work_items (PHSAKernelLaunchData
*context
, void *group_base_ptr
)
322 hsa_kernel_dispatch_packet_t
*dp
= context
->dp
;
325 /* TO DO: host-side memory management of group and private segment
326 memory. Agents in general are less likely to support efficient dynamic mem
328 if (dp
->group_segment_size
> 0
329 && posix_memalign (&group_base_ptr
, PRIVATE_SEGMENT_ALIGN
,
330 dp
->group_segment_size
) != 0)
331 phsa_fatal_error (3);
333 context
->group_segment_start_addr
= (size_t) group_base_ptr
;
335 /* HSA seems to allow the WG size to be larger than the grid size. We need to
336 saturate the effective WG size to the grid size to prevent the extra WIs
338 size_t sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, sat_wg_size
;
339 sat_wg_size_x
= MIN (dp
->workgroup_size_x
, dp
->grid_size_x
);
340 sat_wg_size_y
= MIN (dp
->workgroup_size_y
, dp
->grid_size_y
);
341 sat_wg_size_z
= MIN (dp
->workgroup_size_z
, dp
->grid_size_z
);
342 sat_wg_size
= sat_wg_size_x
* sat_wg_size_y
* sat_wg_size_z
;
344 #ifdef BENCHMARK_PHSA_RT
345 wi_total
= (uint64_t) dp
->grid_size_x
346 * (dp
->grid_size_y
> 0 ? dp
->grid_size_y
: 1)
347 * (dp
->grid_size_z
> 0 ? dp
->grid_size_z
: 1);
350 /* For now execute all work groups in a single coarse thread (does not utilize
351 multicore/multithread). */
352 context
->wg_min_x
= context
->wg_min_y
= context
->wg_min_z
= 0;
354 int dims
= dp
->setup
& 0x3;
356 context
->wg_max_x
= ((uint64_t) dp
->grid_size_x
+ dp
->workgroup_size_x
- 1)
357 / dp
->workgroup_size_x
;
360 = dims
< 2 ? 1 : ((uint64_t) dp
->grid_size_y
+ dp
->workgroup_size_y
- 1)
361 / dp
->workgroup_size_y
;
364 = dims
< 3 ? 1 : ((uint64_t) dp
->grid_size_z
+ dp
->workgroup_size_z
- 1)
365 / dp
->workgroup_size_z
;
368 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
369 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
370 context
->wg_min_x
, context
->wg_min_y
, context
->wg_min_z
,
371 context
->wg_max_x
, context
->wg_max_y
, context
->wg_max_z
,
372 sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, dp
->grid_size_x
,
373 dp
->grid_size_y
, dp
->grid_size_z
);
376 phsa_execute_wi_gang (context
, group_base_ptr
, sat_wg_size_x
, sat_wg_size_y
,
379 if (dp
->group_segment_size
> 0)
380 free (group_base_ptr
);
384 /* Executes the given work-group function for all work groups in the grid.
386 A work-group function is a version of the original kernel which executes
387 the kernel for all work-items in a work-group. It is produced by gccbrig
388 if it can handle the kernel's barrier usage and is much faster way to
389 execute massive numbers of work-items in a non-SPMD machine than fibers
390 (easily 100x faster). */
392 phsa_execute_work_groups (PHSAKernelLaunchData
*context
, void *group_base_ptr
)
394 hsa_kernel_dispatch_packet_t
*dp
= context
->dp
;
395 size_t x
, y
, z
, wg_x
, wg_y
, wg_z
;
397 /* TODO: host-side memory management of group and private segment
398 memory. Agents in general are less likely to support efficient dynamic mem
400 if (dp
->group_segment_size
> 0
401 && posix_memalign (&group_base_ptr
, GROUP_SEGMENT_ALIGN
,
402 dp
->group_segment_size
) != 0)
403 phsa_fatal_error (3);
405 context
->group_segment_start_addr
= (size_t) group_base_ptr
;
407 /* HSA seems to allow the WG size to be larger than the grid size. We need
408 to saturate the effective WG size to the grid size to prevent the extra WIs
410 size_t sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, sat_wg_size
;
411 sat_wg_size_x
= MIN (dp
->workgroup_size_x
, dp
->grid_size_x
);
412 sat_wg_size_y
= MIN (dp
->workgroup_size_y
, dp
->grid_size_y
);
413 sat_wg_size_z
= MIN (dp
->workgroup_size_z
, dp
->grid_size_z
);
414 sat_wg_size
= sat_wg_size_x
* sat_wg_size_y
* sat_wg_size_z
;
416 #ifdef BENCHMARK_PHSA_RT
417 wi_total
= (uint64_t) dp
->grid_size_x
418 * (dp
->grid_size_y
> 0 ? dp
->grid_size_y
: 1)
419 * (dp
->grid_size_z
> 0 ? dp
->grid_size_z
: 1);
422 context
->wg_min_x
= context
->wg_min_y
= context
->wg_min_z
= 0;
424 int dims
= dp
->setup
& 0x3;
426 context
->wg_max_x
= ((uint64_t) dp
->grid_size_x
+ dp
->workgroup_size_x
- 1)
427 / dp
->workgroup_size_x
;
430 = dims
< 2 ? 1 : ((uint64_t) dp
->grid_size_y
+ dp
->workgroup_size_y
- 1)
431 / dp
->workgroup_size_y
;
434 = dims
< 3 ? 1 : ((uint64_t) dp
->grid_size_z
+ dp
->workgroup_size_z
- 1)
435 / dp
->workgroup_size_z
;
438 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
439 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
440 context
->wg_min_x
, context
->wg_min_y
, context
->wg_min_z
,
441 context
->wg_max_x
, context
->wg_max_y
, context
->wg_max_z
,
442 sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, dp
->grid_size_x
,
443 dp
->grid_size_y
, dp
->grid_size_z
);
449 wi
.x
= wi
.y
= wi
.z
= 0;
450 wi
.launch_data
= context
;
452 #ifdef BENCHMARK_PHSA_RT
453 start_time
= clock ();
454 uint64_t wg_count
= 0;
457 size_t wg_size
= __hsail_workgroupsize (0, &wi
)
458 * __hsail_workgroupsize (1, &wi
)
459 * __hsail_workgroupsize (2, &wi
);
461 void *private_base_ptr
= NULL
;
462 if (dp
->private_segment_size
> 0
463 && posix_memalign (&private_base_ptr
, PRIVATE_SEGMENT_ALIGN
,
464 dp
->private_segment_size
* wg_size
)
466 phsa_fatal_error (3);
468 wg
.alloca_stack_p
= dp
->private_segment_size
* wg_size
;
469 wg
.alloca_frame_p
= wg
.alloca_stack_p
;
471 wg
.private_base_ptr
= private_base_ptr
;
472 wg
.group_base_ptr
= group_base_ptr
;
475 printf ("priv seg size %u wg_size %lu @ %p\n", dp
->private_segment_size
,
476 wg_size
, private_base_ptr
);
479 for (wg_z
= context
->wg_min_z
; wg_z
< context
->wg_max_z
; ++wg_z
)
480 for (wg_y
= context
->wg_min_y
; wg_y
< context
->wg_max_y
; ++wg_y
)
481 for (wg_x
= context
->wg_min_x
; wg_x
< context
->wg_max_x
; ++wg_x
)
487 context
->kernel (context
->kernarg_addr
, &wi
, group_base_ptr
,
490 #if defined (BENCHMARK_PHSA_RT)
492 if (wg_count
% 1000000 == 0)
494 clock_t spent_time
= clock () - start_time
;
495 uint64_t wi_count
= wg_x
* sat_wg_size_x
+ wg_y
* sat_wg_size_y
496 + wg_z
* sat_wg_size_z
;
497 double spent_time_sec
= (double) spent_time
/ CLOCKS_PER_SEC
;
498 double wis_per_sec
= wi_count
/ spent_time_sec
;
499 uint64_t eta_sec
= (wi_total
- wi_count
) / wis_per_sec
;
501 printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
502 wi_count
, (uint64_t) spent_time_sec
,
503 (uint64_t) wis_per_sec
, (uint64_t) eta_sec
);
508 #ifdef BENCHMARK_PHSA_RT
509 clock_t spent_time
= clock () - start_time
;
510 double spent_time_sec
= (double) spent_time
/ CLOCKS_PER_SEC
;
511 double wis_per_sec
= wi_total
/ spent_time_sec
;
513 printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total
,
514 (uint64_t) spent_time_sec
, (uint64_t) wis_per_sec
);
517 if (dp
->group_segment_size
> 0)
518 free (group_base_ptr
);
520 free (private_base_ptr
);
521 private_base_ptr
= NULL
;
524 /* gccbrig generates the following from each HSAIL kernel:
526 1) The actual kernel function (a single work-item kernel or a work-group
527 function) generated from HSAIL (BRIG).
529 static void _Kernel (void* args, void* context, void* group_base_ptr)
534 2) A public facing kernel function that is called from the PHSA runtime:
536 a) A single work-item function (that requires fibers for multi-WI):
538 void Kernel (void* context)
540 __launch_launch_kernel (_Kernel, context);
545 b) a when gccbrig could generate a work-group function:
547 void Kernel (void* context)
549 __hsail_launch_wg_function (_Kernel, context);
556 __hsail_launch_kernel (gccbrigKernelFunc kernel
, PHSAKernelLaunchData
*context
,
557 void *group_base_ptr
)
559 context
->kernel
= kernel
;
560 phsa_spawn_work_items (context
, group_base_ptr
);
565 __hsail_launch_wg_function (gccbrigKernelFunc kernel
,
566 PHSAKernelLaunchData
*context
, void *group_base_ptr
)
568 context
->kernel
= kernel
;
569 phsa_execute_work_groups (context
, group_base_ptr
);
573 __hsail_workitemabsid (uint32_t dim
, PHSAWorkItem
*context
)
575 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
582 /* Overflow semantics in the case of WG dim > grid dim. */
583 id
= ((uint64_t) context
->wg
->x
* dp
->workgroup_size_x
+ context
->x
)
587 id
= ((uint64_t) context
->wg
->y
* dp
->workgroup_size_y
+ context
->y
)
591 id
= ((uint64_t) context
->wg
->z
* dp
->workgroup_size_z
+ context
->z
)
599 __hsail_workitemabsid_u64 (uint32_t dim
, PHSAWorkItem
*context
)
601 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
608 /* Overflow semantics in the case of WG dim > grid dim. */
609 id
= ((uint64_t) context
->wg
->x
* dp
->workgroup_size_x
+ context
->x
)
613 id
= ((uint64_t) context
->wg
->y
* dp
->workgroup_size_y
+ context
->y
)
617 id
= ((uint64_t) context
->wg
->z
* dp
->workgroup_size_z
+ context
->z
)
626 __hsail_workitemid (uint32_t dim
, PHSAWorkItem
*context
)
628 PHSAWorkItem
*c
= (PHSAWorkItem
*) context
;
629 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
631 /* The number of dimensions is in the two least significant bits. */
632 int dims
= dp
->setup
& 0x3;
642 id
= dims
< 2 ? 0 : c
->y
;
645 id
= dims
< 3 ? 0 : c
->z
;
652 __hsail_gridgroups (uint32_t dim
, PHSAWorkItem
*context
)
654 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
655 int dims
= dp
->setup
& 0x3;
662 id
= (dp
->grid_size_x
+ dp
->workgroup_size_x
- 1) / dp
->workgroup_size_x
;
665 id
= dims
< 2 ? 1 : (dp
->grid_size_y
+ dp
->workgroup_size_y
- 1)
666 / dp
->workgroup_size_y
;
669 id
= dims
< 3 ? 1 : (dp
->grid_size_z
+ dp
->workgroup_size_z
- 1)
670 / dp
->workgroup_size_z
;
677 __hsail_workitemflatid (PHSAWorkItem
*c
)
679 hsa_kernel_dispatch_packet_t
*dp
= c
->launch_data
->dp
;
681 return c
->x
+ c
->y
* dp
->workgroup_size_x
682 + c
->z
* dp
->workgroup_size_x
* dp
->workgroup_size_y
;
686 __hsail_currentworkitemflatid (PHSAWorkItem
*c
)
688 hsa_kernel_dispatch_packet_t
*dp
= c
->launch_data
->dp
;
690 return c
->x
+ c
->y
* __hsail_currentworkgroupsize (0, c
)
691 + c
->z
* __hsail_currentworkgroupsize (0, c
)
692 * __hsail_currentworkgroupsize (1, c
);
696 __hsail_setworkitemid (uint32_t dim
, uint32_t id
, PHSAWorkItem
*context
)
714 __hsail_workitemflatabsid_u64 (PHSAWorkItem
*context
)
716 PHSAWorkItem
*c
= (PHSAWorkItem
*) context
;
717 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
719 /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
720 uint64_t id0
= __hsail_workitemabsid (0, context
);
721 uint64_t id1
= __hsail_workitemabsid (1, context
);
722 uint64_t id2
= __hsail_workitemabsid (2, context
);
724 uint64_t max0
= dp
->grid_size_x
;
725 uint64_t max1
= dp
->grid_size_y
;
726 uint64_t id
= id0
+ id1
* max0
+ id2
* max0
* max1
;
732 __hsail_workitemflatabsid_u32 (PHSAWorkItem
*context
)
734 PHSAWorkItem
*c
= (PHSAWorkItem
*) context
;
735 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
737 /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
738 uint64_t id0
= __hsail_workitemabsid (0, context
);
739 uint64_t id1
= __hsail_workitemabsid (1, context
);
740 uint64_t id2
= __hsail_workitemabsid (2, context
);
742 uint64_t max0
= dp
->grid_size_x
;
743 uint64_t max1
= dp
->grid_size_y
;
744 uint64_t id
= id0
+ id1
* max0
+ id2
* max0
* max1
;
745 return (uint32_t) id
;
749 __hsail_currentworkgroupsize (uint32_t dim
, PHSAWorkItem
*wi
)
751 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
752 uint32_t wg_size
= 0;
757 if ((uint64_t) wi
->wg
->x
< dp
->grid_size_x
/ dp
->workgroup_size_x
)
758 wg_size
= dp
->workgroup_size_x
; /* Full WG. */
760 wg_size
= dp
->grid_size_x
% dp
->workgroup_size_x
; /* Partial WG. */
763 if ((uint64_t) wi
->wg
->y
< dp
->grid_size_y
/ dp
->workgroup_size_y
)
764 wg_size
= dp
->workgroup_size_y
; /* Full WG. */
766 wg_size
= dp
->grid_size_y
% dp
->workgroup_size_y
; /* Partial WG. */
769 if ((uint64_t) wi
->wg
->z
< dp
->grid_size_z
/ dp
->workgroup_size_z
)
770 wg_size
= dp
->workgroup_size_z
; /* Full WG. */
772 wg_size
= dp
->grid_size_z
% dp
->workgroup_size_z
; /* Partial WG. */
779 __hsail_workgroupsize (uint32_t dim
, PHSAWorkItem
*wi
)
781 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
786 return dp
->workgroup_size_x
;
788 return dp
->workgroup_size_y
;
790 return dp
->workgroup_size_z
;
795 __hsail_gridsize (uint32_t dim
, PHSAWorkItem
*wi
)
797 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
802 return dp
->grid_size_x
;
804 return dp
->grid_size_y
;
806 return dp
->grid_size_z
;
811 __hsail_workgroupid (uint32_t dim
, PHSAWorkItem
*wi
)
826 __hsail_dim (PHSAWorkItem
*wi
)
828 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
829 return dp
->setup
& 0x3;
833 __hsail_packetid (PHSAWorkItem
*wi
)
835 return wi
->launch_data
->packet_id
;
839 __hsail_packetcompletionsig_sig32 (PHSAWorkItem
*wi
)
841 return (uint32_t) wi
->launch_data
->dp
->completion_signal
.handle
;
845 __hsail_packetcompletionsig_sig64 (PHSAWorkItem
*wi
)
847 return (uint64_t) (wi
->launch_data
->dp
->completion_signal
.handle
);
852 __hsail_barrier (PHSAWorkItem
*wi
)
854 fiber_barrier_reach ((fiber_barrier_t
*) wi
->launch_data
->wg_sync_barrier
);
858 /* Return a 32b private segment address that points to a dynamically
859 allocated chunk of 'size' with 'align'.
861 Allocates the space from the end of the private segment allocated
862 for the whole work group. In implementations with separate private
863 memories per WI, we will need to have a stack pointer per WI. But in
864 the current implementation, the segment is shared, so we possibly
865 save some space in case all WIs do not call the alloca.
867 The "alloca frames" are organized as follows:
869 wg->alloca_stack_p points to the last allocated data (initially
870 outside the private segment)
871 wg->alloca_frame_p points to the first address _outside_ the current
872 function's allocations (initially to the same as alloca_stack_p)
874 The data is allocated downwards from the end of the private segment.
876 In the beginning of a new function which has allocas, a new alloca
877 frame is pushed which adds the current alloca_frame_p (the current
878 function's frame starting point) to the top of the alloca stack and
879 alloca_frame_p is set to the current stack position.
881 At the exit points of a function with allocas, the alloca frame
882 is popped before returning. This involves popping the alloca_frame_p
883 to the one of the previous function in the call stack, and alloca_stack_p
884 similarly, to the position of the last word alloca'd by the previous
889 __hsail_alloca (uint32_t size
, uint32_t align
, PHSAWorkItem
*wi
)
891 volatile PHSAWorkGroup
*wg
= wi
->wg
;
892 uint32_t new_pos
= wg
->alloca_stack_p
- size
;
893 while (new_pos
% align
!= 0)
895 wg
->alloca_stack_p
= new_pos
;
898 printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size
, align
,
899 wg
->alloca_stack_p
, wg
->alloca_frame_p
);
904 /* Initializes a new "alloca frame" in the private segment.
905 This should be called at all the function entry points in case
906 the function contains at least one call to alloca. */
909 __hsail_alloca_push_frame (PHSAWorkItem
*wi
)
911 volatile PHSAWorkGroup
*wg
= wi
->wg
;
913 /* Store the alloca_frame_p without any alignment padding so
914 we know exactly where the previous frame ended after popping
917 printf ("--- push frame ");
919 uint32_t last_word_offs
= __hsail_alloca (4, 1, wi
);
920 memcpy (wg
->private_base_ptr
+ last_word_offs
,
921 (const void *) &wg
->alloca_frame_p
, 4);
922 wg
->alloca_frame_p
= last_word_offs
;
925 printf ("--- sp @%u fp @%u\n", wg
->alloca_stack_p
, wg
->alloca_frame_p
);
929 /* Frees the current "alloca frame" and restores the frame
931 This should be called at all the function return points in case
932 the function contains at least one call to alloca. Restores the
933 alloca stack to the condition it was before pushing the frame
936 __hsail_alloca_pop_frame (PHSAWorkItem
*wi
)
938 volatile PHSAWorkGroup
*wg
= wi
->wg
;
940 wg
->alloca_stack_p
= wg
->alloca_frame_p
;
941 memcpy ((void *) &wg
->alloca_frame_p
,
942 (const void *) (wg
->private_base_ptr
+ wg
->alloca_frame_p
), 4);
943 /* Now frame_p points to the beginning of the previous function's
944 frame and stack_p to its end. */
946 wg
->alloca_stack_p
+= 4;
949 printf ("--- pop frame sp @%u fp @%u\n", wg
->alloca_stack_p
,