PR tree-optimization/71437
[official-gcc.git] / libhsail-rt / rt / workitems.c
blob1114e59555634d41602ae45b1615ac059f8a3cf1
1 /* workitems.c -- The main runtime entry that performs work-item execution in
2 various ways and the builtin functions closely related to the
3 implementation.
5 Copyright (C) 2015-2017 Free Software Foundation, Inc.
6 Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
7 for General Processor Tech.
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files
11 (the "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be included
18 in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
24 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
25 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
26 USE OR OTHER DEALINGS IN THE SOFTWARE.
29 /* The fiber based multiple work-item work-group execution uses ucontext
30 based user mode threading. However, if gccbrig is able to optimize the
31 kernel to a much faster work-group function that implements the multiple
32 WI execution using loops instead of fibers requiring slow context switches,
33 the fiber-based implementation won't be called.
36 #include <stdlib.h>
37 #include <signal.h>
38 #include <string.h>
40 #include "workitems.h"
41 #include "phsa-rt.h"
43 #ifdef HAVE_FIBERS
44 #include "fibers.h"
45 #endif
47 #ifdef BENCHMARK_PHSA_RT
48 #include <stdio.h>
49 #include <time.h>
51 static uint64_t wi_count = 0;
52 static uint64_t wis_skipped = 0;
53 static uint64_t wi_total = 0;
54 static clock_t start_time;
56 #endif
58 #ifdef DEBUG_PHSA_RT
59 #include <stdio.h>
60 #endif
62 #define PRIVATE_SEGMENT_ALIGN 256
63 #define FIBER_STACK_SIZE (64*1024)
64 #define GROUP_SEGMENT_ALIGN 256
66 /* HSA requires WGs to be executed in flat work-group id order. Enabling
67 the following macro can reveal test cases that rely on the ordering,
68 but is not useful for much else. */
70 uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
72 uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
74 uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
76 uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
78 uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
80 void
81 phsa_fatal_error (int code)
83 exit (code);
86 #ifdef HAVE_FIBERS
87 /* ucontext-based work-item thread implementation. Runs all work-items in
88 separate fibers. */
90 static void
91 phsa_work_item_thread (int arg0, int arg1)
93 void *arg = fiber_int_args_to_ptr (arg0, arg1);
95 PHSAWorkItem *wi = (PHSAWorkItem *) arg;
96 volatile PHSAWorkGroup *wg = wi->wg;
97 PHSAKernelLaunchData *l_data = wi->launch_data;
101 int retcode
102 = fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);
104 /* At this point the threads can assume that either more_wgs is 0 or
105 the current_work_group_* is set to point to the WG executed next. */
106 if (!wi->wg->more_wgs)
107 break;
108 #ifdef DEBUG_PHSA_RT
109 printf (
110 "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
111 wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x,
112 l_data->wg_max_y, l_data->wg_max_z);
113 #endif
115 if (wi->x < __hsail_currentworkgroupsize (0, wi)
116 && wi->y < __hsail_currentworkgroupsize (1, wi)
117 && wi->z < __hsail_currentworkgroupsize (2, wi))
119 l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
120 wg->private_base_ptr);
121 #ifdef DEBUG_PHSA_RT
122 printf ("done.\n");
123 #endif
124 #ifdef BENCHMARK_PHSA_RT
125 wi_count++;
126 #endif
128 else
130 #ifdef DEBUG_PHSA_RT
131 printf ("skipped (partial WG).\n");
132 #endif
133 #ifdef BENCHMARK_PHSA_RT
134 wis_skipped++;
135 #endif
138 retcode
139 = fiber_barrier_reach ((fiber_barrier_t *)
140 l_data->wg_completion_barrier);
142 /* The first thread updates the WG to execute next etc. */
144 if (retcode == 0)
146 #ifdef EXECUTE_WGS_BACKWARDS
147 if (wg->x == l_data->wg_min_x)
149 wg->x = l_data->wg_max_x - 1;
150 if (wg->y == l_data->wg_min_y)
152 wg->y = l_data->wg_max_y - 1;
153 if (wg->z == l_data->wg_min_z)
154 wg->more_wgs = 0;
155 else
156 wg->z--;
158 else
159 wg->y--;
161 else
162 wg->x--;
163 #else
164 if (wg->x + 1 >= l_data->wg_max_x)
166 wg->x = l_data->wg_min_x;
167 if (wg->y + 1 >= l_data->wg_max_y)
169 wg->y = l_data->wg_min_y;
170 if (wg->z + 1 >= l_data->wg_max_z)
171 wg->more_wgs = 0;
172 else
173 wg->z++;
175 else
176 wg->y++;
178 else
179 wg->x++;
180 #endif
182 /* Reinitialize the work-group barrier according to the new WG's
183 size, which might not be the same as the previous ones, due
184 to "partial WGs". */
185 size_t wg_size = __hsail_currentworkgroupsize (0, wi)
186 * __hsail_currentworkgroupsize (1, wi)
187 * __hsail_currentworkgroupsize (2, wi);
189 #ifdef DEBUG_PHSA_RT
190 printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
191 #endif
192 fiber_barrier_init ((fiber_barrier_t *)
193 wi->launch_data->wg_sync_barrier,
194 wg_size);
196 #ifdef BENCHMARK_PHSA_RT
197 if (wi_count % 1000 == 0)
199 clock_t spent_time = clock () - start_time;
200 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
201 double wis_per_sec = wi_count / spent_time_sec;
202 uint64_t eta_sec
203 = (wi_total - wi_count - wis_skipped) / wis_per_sec;
205 printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
206 "%lu s)\n",
207 wi_count, wis_skipped, (uint64_t) spent_time_sec,
208 (uint64_t) wis_per_sec, (uint64_t) eta_sec);
210 #endif
213 while (1);
215 fiber_exit ();
217 #endif
219 #define MIN(a, b) ((a < b) ? a : b)
220 #define MAX(a, b) ((a > b) ? a : b)
222 #ifdef HAVE_FIBERS
223 /* Spawns a given number of work-items to execute a set of work-groups,
224 blocks until their completion. */
226 static void
227 phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
228 size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
230 PHSAWorkItem *wi_threads = NULL;
231 PHSAWorkGroup wg;
232 size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
233 fiber_barrier_t wg_start_barrier;
234 fiber_barrier_t wg_completion_barrier;
235 fiber_barrier_t wg_sync_barrier;
237 max_x = wg_size_x == 0 ? 1 : wg_size_x;
238 max_y = wg_size_y == 0 ? 1 : wg_size_y;
239 max_z = wg_size_z == 0 ? 1 : wg_size_z;
241 size_t wg_size = max_x * max_y * max_z;
242 if (wg_size > PHSA_MAX_WG_SIZE)
243 phsa_fatal_error (2);
245 wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
246 if (wg.private_segment_total_size > 0
247 && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
248 wg.private_segment_total_size)
249 != 0)
250 phsa_fatal_error (3);
252 wg.alloca_stack_p = wg.private_segment_total_size;
253 wg.alloca_frame_p = wg.alloca_stack_p;
255 #ifdef EXECUTE_WGS_BACKWARDS
256 wg.x = context->wg_max_x - 1;
257 wg.y = context->wg_max_y - 1;
258 wg.z = context->wg_max_z - 1;
259 #else
260 wg.x = context->wg_min_x;
261 wg.y = context->wg_min_y;
262 wg.z = context->wg_min_z;
263 #endif
265 fiber_barrier_init (&wg_sync_barrier, wg_size);
266 fiber_barrier_init (&wg_start_barrier, wg_size);
267 fiber_barrier_init (&wg_completion_barrier, wg_size);
269 context->wg_start_barrier = &wg_start_barrier;
270 context->wg_sync_barrier = &wg_sync_barrier;
271 context->wg_completion_barrier = &wg_completion_barrier;
273 wg.more_wgs = 1;
274 wg.group_base_ptr = group_base_ptr;
276 #ifdef BENCHMARK_PHSA_RT
277 wi_count = 0;
278 wis_skipped = 0;
279 start_time = clock ();
280 #endif
281 wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
282 for (x = 0; x < max_x; ++x)
283 for (y = 0; y < max_y; ++y)
284 for (z = 0; z < max_z; ++z)
286 PHSAWorkItem *wi = &wi_threads[flat_wi_id];
287 wi->launch_data = context;
288 wi->wg = &wg;
289 wi->x = x;
290 wi->y = y;
291 wi->z = z;
293 /* TODO: set the stack size according to the private
294 segment size. Too big stack consumes huge amount of
295 memory in case of huge number of WIs and a too small stack
296 will fail in mysterious and potentially dangerous ways. */
298 fiber_init (&wi->fiber, phsa_work_item_thread, wi,
299 FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
300 ++flat_wi_id;
305 --flat_wi_id;
306 fiber_join (&wi_threads[flat_wi_id].fiber);
308 while (flat_wi_id > 0);
310 if (wg.private_segment_total_size > 0)
311 free (wg.private_base_ptr);
313 free (wi_threads);
316 /* Spawn the work-item threads to execute work-groups and let
317 them execute all the WGs, including a potential partial WG. */
319 static void
320 phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr)
322 hsa_kernel_dispatch_packet_t *dp = context->dp;
323 size_t x, y, z;
325 /* TO DO: host-side memory management of group and private segment
326 memory. Agents in general are less likely to support efficient dynamic mem
327 allocation. */
328 if (dp->group_segment_size > 0
329 && posix_memalign (&group_base_ptr, PRIVATE_SEGMENT_ALIGN,
330 dp->group_segment_size) != 0)
331 phsa_fatal_error (3);
333 context->group_segment_start_addr = (size_t) group_base_ptr;
335 /* HSA seems to allow the WG size to be larger than the grid size. We need to
336 saturate the effective WG size to the grid size to prevent the extra WIs
337 from executing. */
338 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
339 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
340 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
341 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
342 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
344 #ifdef BENCHMARK_PHSA_RT
345 wi_total = (uint64_t) dp->grid_size_x
346 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
347 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
348 #endif
350 /* For now execute all work groups in a single coarse thread (does not utilize
351 multicore/multithread). */
352 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
354 int dims = dp->setup & 0x3;
356 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
357 / dp->workgroup_size_x;
359 context->wg_max_y
360 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
361 / dp->workgroup_size_y;
363 context->wg_max_z
364 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
365 / dp->workgroup_size_z;
367 #ifdef DEBUG_PHSA_RT
368 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
369 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
370 context->wg_min_x, context->wg_min_y, context->wg_min_z,
371 context->wg_max_x, context->wg_max_y, context->wg_max_z,
372 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
373 dp->grid_size_y, dp->grid_size_z);
374 #endif
376 phsa_execute_wi_gang (context, group_base_ptr, sat_wg_size_x, sat_wg_size_y,
377 sat_wg_size_z);
379 if (dp->group_segment_size > 0)
380 free (group_base_ptr);
382 #endif
384 /* Executes the given work-group function for all work groups in the grid.
386 A work-group function is a version of the original kernel which executes
387 the kernel for all work-items in a work-group. It is produced by gccbrig
388 if it can handle the kernel's barrier usage and is much faster way to
389 execute massive numbers of work-items in a non-SPMD machine than fibers
390 (easily 100x faster). */
391 static void
392 phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr)
394 hsa_kernel_dispatch_packet_t *dp = context->dp;
395 size_t x, y, z, wg_x, wg_y, wg_z;
397 /* TODO: host-side memory management of group and private segment
398 memory. Agents in general are less likely to support efficient dynamic mem
399 allocation. */
400 if (dp->group_segment_size > 0
401 && posix_memalign (&group_base_ptr, GROUP_SEGMENT_ALIGN,
402 dp->group_segment_size) != 0)
403 phsa_fatal_error (3);
405 context->group_segment_start_addr = (size_t) group_base_ptr;
407 /* HSA seems to allow the WG size to be larger than the grid size. We need
408 to saturate the effective WG size to the grid size to prevent the extra WIs
409 from executing. */
410 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
411 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
412 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
413 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
414 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
416 #ifdef BENCHMARK_PHSA_RT
417 wi_total = (uint64_t) dp->grid_size_x
418 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
419 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
420 #endif
422 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
424 int dims = dp->setup & 0x3;
426 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
427 / dp->workgroup_size_x;
429 context->wg_max_y
430 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
431 / dp->workgroup_size_y;
433 context->wg_max_z
434 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
435 / dp->workgroup_size_z;
437 #ifdef DEBUG_PHSA_RT
438 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
439 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
440 context->wg_min_x, context->wg_min_y, context->wg_min_z,
441 context->wg_max_x, context->wg_max_y, context->wg_max_z,
442 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
443 dp->grid_size_y, dp->grid_size_z);
444 #endif
446 PHSAWorkItem wi;
447 PHSAWorkGroup wg;
448 wi.wg = &wg;
449 wi.x = wi.y = wi.z = 0;
450 wi.launch_data = context;
452 #ifdef BENCHMARK_PHSA_RT
453 start_time = clock ();
454 uint64_t wg_count = 0;
455 #endif
457 size_t wg_size = __hsail_workgroupsize (0, &wi)
458 * __hsail_workgroupsize (1, &wi)
459 * __hsail_workgroupsize (2, &wi);
461 void *private_base_ptr = NULL;
462 if (dp->private_segment_size > 0
463 && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
464 dp->private_segment_size * wg_size)
465 != 0)
466 phsa_fatal_error (3);
468 wg.alloca_stack_p = dp->private_segment_size * wg_size;
469 wg.alloca_frame_p = wg.alloca_stack_p;
471 wg.private_base_ptr = private_base_ptr;
472 wg.group_base_ptr = group_base_ptr;
474 #ifdef DEBUG_PHSA_RT
475 printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
476 wg_size, private_base_ptr);
477 #endif
479 for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
480 for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
481 for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
483 wi.wg->x = wg_x;
484 wi.wg->y = wg_y;
485 wi.wg->z = wg_z;
487 context->kernel (context->kernarg_addr, &wi, group_base_ptr,
488 private_base_ptr);
490 #if defined (BENCHMARK_PHSA_RT)
491 wg_count++;
492 if (wg_count % 1000000 == 0)
494 clock_t spent_time = clock () - start_time;
495 uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
496 + wg_z * sat_wg_size_z;
497 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
498 double wis_per_sec = wi_count / spent_time_sec;
499 uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
501 printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
502 wi_count, (uint64_t) spent_time_sec,
503 (uint64_t) wis_per_sec, (uint64_t) eta_sec);
505 #endif
508 #ifdef BENCHMARK_PHSA_RT
509 clock_t spent_time = clock () - start_time;
510 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
511 double wis_per_sec = wi_total / spent_time_sec;
513 printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
514 (uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
515 #endif
517 if (dp->group_segment_size > 0)
518 free (group_base_ptr);
520 free (private_base_ptr);
521 private_base_ptr = NULL;
524 /* gccbrig generates the following from each HSAIL kernel:
526 1) The actual kernel function (a single work-item kernel or a work-group
527 function) generated from HSAIL (BRIG).
529 static void _Kernel (void* args, void* context, void* group_base_ptr)
534 2) A public facing kernel function that is called from the PHSA runtime:
536 a) A single work-item function (that requires fibers for multi-WI):
538 void Kernel (void* context)
540 __launch_launch_kernel (_Kernel, context);
545 b) a when gccbrig could generate a work-group function:
547 void Kernel (void* context)
549 __hsail_launch_wg_function (_Kernel, context);
553 #ifdef HAVE_FIBERS
555 void
556 __hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
557 void *group_base_ptr)
559 context->kernel = kernel;
560 phsa_spawn_work_items (context, group_base_ptr);
562 #endif
564 void
565 __hsail_launch_wg_function (gccbrigKernelFunc kernel,
566 PHSAKernelLaunchData *context, void *group_base_ptr)
568 context->kernel = kernel;
569 phsa_execute_work_groups (context, group_base_ptr);
572 uint32_t
573 __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
575 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
577 uint32_t id;
578 switch (dim)
580 default:
581 case 0:
582 /* Overflow semantics in the case of WG dim > grid dim. */
583 id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
584 % dp->grid_size_x;
585 break;
586 case 1:
587 id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
588 % dp->grid_size_y;
589 break;
590 case 2:
591 id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
592 % dp->grid_size_z;
593 break;
595 return id;
598 uint64_t
599 __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
601 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
603 uint64_t id;
604 switch (dim)
606 default:
607 case 0:
608 /* Overflow semantics in the case of WG dim > grid dim. */
609 id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
610 % dp->grid_size_x;
611 break;
612 case 1:
613 id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
614 % dp->grid_size_y;
615 break;
616 case 2:
617 id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
618 % dp->grid_size_z;
619 break;
621 return id;
625 uint32_t
626 __hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
628 PHSAWorkItem *c = (PHSAWorkItem *) context;
629 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
631 /* The number of dimensions is in the two least significant bits. */
632 int dims = dp->setup & 0x3;
634 uint32_t id;
635 switch (dim)
637 default:
638 case 0:
639 id = c->x;
640 break;
641 case 1:
642 id = dims < 2 ? 0 : c->y;
643 break;
644 case 2:
645 id = dims < 3 ? 0 : c->z;
646 break;
648 return id;
651 uint32_t
652 __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
654 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
655 int dims = dp->setup & 0x3;
657 uint32_t id;
658 switch (dim)
660 default:
661 case 0:
662 id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
663 break;
664 case 1:
665 id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
666 / dp->workgroup_size_y;
667 break;
668 case 2:
669 id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
670 / dp->workgroup_size_z;
671 break;
673 return id;
676 uint32_t
677 __hsail_workitemflatid (PHSAWorkItem *c)
679 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
681 return c->x + c->y * dp->workgroup_size_x
682 + c->z * dp->workgroup_size_x * dp->workgroup_size_y;
685 uint32_t
686 __hsail_currentworkitemflatid (PHSAWorkItem *c)
688 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
690 return c->x + c->y * __hsail_currentworkgroupsize (0, c)
691 + c->z * __hsail_currentworkgroupsize (0, c)
692 * __hsail_currentworkgroupsize (1, c);
695 void
696 __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
698 switch (dim)
700 default:
701 case 0:
702 context->x = id;
703 break;
704 case 1:
705 context->y = id;
706 break;
707 case 2:
708 context->z = id;
709 break;
713 uint64_t
714 __hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
716 PHSAWorkItem *c = (PHSAWorkItem *) context;
717 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
719 /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
720 uint64_t id0 = __hsail_workitemabsid (0, context);
721 uint64_t id1 = __hsail_workitemabsid (1, context);
722 uint64_t id2 = __hsail_workitemabsid (2, context);
724 uint64_t max0 = dp->grid_size_x;
725 uint64_t max1 = dp->grid_size_y;
726 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
728 return id;
731 uint32_t
732 __hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
734 PHSAWorkItem *c = (PHSAWorkItem *) context;
735 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
737 /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
738 uint64_t id0 = __hsail_workitemabsid (0, context);
739 uint64_t id1 = __hsail_workitemabsid (1, context);
740 uint64_t id2 = __hsail_workitemabsid (2, context);
742 uint64_t max0 = dp->grid_size_x;
743 uint64_t max1 = dp->grid_size_y;
744 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
745 return (uint32_t) id;
748 uint32_t
749 __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
751 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
752 uint32_t wg_size = 0;
753 switch (dim)
755 default:
756 case 0:
757 if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x)
758 wg_size = dp->workgroup_size_x; /* Full WG. */
759 else
760 wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */
761 break;
762 case 1:
763 if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y)
764 wg_size = dp->workgroup_size_y; /* Full WG. */
765 else
766 wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */
767 break;
768 case 2:
769 if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z)
770 wg_size = dp->workgroup_size_z; /* Full WG. */
771 else
772 wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */
773 break;
775 return wg_size;
778 uint32_t
779 __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
781 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
782 switch (dim)
784 default:
785 case 0:
786 return dp->workgroup_size_x;
787 case 1:
788 return dp->workgroup_size_y;
789 case 2:
790 return dp->workgroup_size_z;
794 uint32_t
795 __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
797 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
798 switch (dim)
800 default:
801 case 0:
802 return dp->grid_size_x;
803 case 1:
804 return dp->grid_size_y;
805 case 2:
806 return dp->grid_size_z;
810 uint32_t
811 __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
813 switch (dim)
815 default:
816 case 0:
817 return wi->wg->x;
818 case 1:
819 return wi->wg->y;
820 case 2:
821 return wi->wg->z;
825 uint32_t
826 __hsail_dim (PHSAWorkItem *wi)
828 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
829 return dp->setup & 0x3;
832 uint64_t
833 __hsail_packetid (PHSAWorkItem *wi)
835 return wi->launch_data->packet_id;
838 uint32_t
839 __hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
841 return (uint32_t) wi->launch_data->dp->completion_signal.handle;
844 uint64_t
845 __hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
847 return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
850 #ifdef HAVE_FIBERS
851 void
852 __hsail_barrier (PHSAWorkItem *wi)
854 fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
856 #endif
858 /* Return a 32b private segment address that points to a dynamically
859 allocated chunk of 'size' with 'align'.
861 Allocates the space from the end of the private segment allocated
862 for the whole work group. In implementations with separate private
863 memories per WI, we will need to have a stack pointer per WI. But in
864 the current implementation, the segment is shared, so we possibly
865 save some space in case all WIs do not call the alloca.
867 The "alloca frames" are organized as follows:
869 wg->alloca_stack_p points to the last allocated data (initially
870 outside the private segment)
871 wg->alloca_frame_p points to the first address _outside_ the current
872 function's allocations (initially to the same as alloca_stack_p)
874 The data is allocated downwards from the end of the private segment.
876 In the beginning of a new function which has allocas, a new alloca
877 frame is pushed which adds the current alloca_frame_p (the current
878 function's frame starting point) to the top of the alloca stack and
879 alloca_frame_p is set to the current stack position.
881 At the exit points of a function with allocas, the alloca frame
882 is popped before returning. This involves popping the alloca_frame_p
883 to the one of the previous function in the call stack, and alloca_stack_p
884 similarly, to the position of the last word alloca'd by the previous
885 function.
888 uint32_t
889 __hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
891 volatile PHSAWorkGroup *wg = wi->wg;
892 uint32_t new_pos = wg->alloca_stack_p - size;
893 while (new_pos % align != 0)
894 new_pos--;
895 wg->alloca_stack_p = new_pos;
897 #ifdef DEBUG_ALLOCA
898 printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
899 wg->alloca_stack_p, wg->alloca_frame_p);
900 #endif
901 return new_pos;
904 /* Initializes a new "alloca frame" in the private segment.
905 This should be called at all the function entry points in case
906 the function contains at least one call to alloca. */
908 void
909 __hsail_alloca_push_frame (PHSAWorkItem *wi)
911 volatile PHSAWorkGroup *wg = wi->wg;
913 /* Store the alloca_frame_p without any alignment padding so
914 we know exactly where the previous frame ended after popping
915 it. */
916 #ifdef DEBUG_ALLOCA
917 printf ("--- push frame ");
918 #endif
919 uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
920 memcpy (wg->private_base_ptr + last_word_offs,
921 (const void *) &wg->alloca_frame_p, 4);
922 wg->alloca_frame_p = last_word_offs;
924 #ifdef DEBUG_ALLOCA
925 printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
926 #endif
929 /* Frees the current "alloca frame" and restores the frame
930 pointer.
931 This should be called at all the function return points in case
932 the function contains at least one call to alloca. Restores the
933 alloca stack to the condition it was before pushing the frame
934 the last time. */
935 void
936 __hsail_alloca_pop_frame (PHSAWorkItem *wi)
938 volatile PHSAWorkGroup *wg = wi->wg;
940 wg->alloca_stack_p = wg->alloca_frame_p;
941 memcpy ((void *) &wg->alloca_frame_p,
942 (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
943 /* Now frame_p points to the beginning of the previous function's
944 frame and stack_p to its end. */
946 wg->alloca_stack_p += 4;
948 #ifdef DEBUG_ALLOCA
949 printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
950 wg->alloca_frame_p);
951 #endif