2018-07-13 Richard Biener <rguenther@suse.de>
[official-gcc.git] / libhsail-rt / rt / workitems.c
blobc846350e1cdb5d22d26fa1b569912cca50cd9e03
1 /* workitems.c -- The main runtime entry that performs work-item execution in
2 various ways and the builtin functions closely related to the
3 implementation.
5 Copyright (C) 2015-2018 Free Software Foundation, Inc.
6 Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
7 for General Processor Tech.
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files
11 (the "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be included
18 in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
24 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
25 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
26 USE OR OTHER DEALINGS IN THE SOFTWARE.
29 /* The fiber based multiple work-item work-group execution uses ucontext
30 based user mode threading. However, if gccbrig is able to optimize the
31 kernel to a much faster work-group function that implements the multiple
32 WI execution using loops instead of fibers requiring slow context switches,
33 the fiber-based implementation won't be called.
36 #include <stdlib.h>
37 #include <signal.h>
38 #include <string.h>
40 #include "workitems.h"
41 #include "phsa-rt.h"
43 #ifdef HAVE_FIBERS
44 #include "fibers.h"
45 #endif
47 #ifdef BENCHMARK_PHSA_RT
48 #include <stdio.h>
49 #include <time.h>
51 static uint64_t wi_count = 0;
52 static uint64_t wis_skipped = 0;
53 static uint64_t wi_total = 0;
54 static clock_t start_time;
56 #endif
58 #ifdef DEBUG_PHSA_RT
59 #include <stdio.h>
60 #endif
62 #define PRIVATE_SEGMENT_ALIGN 256
63 #define FIBER_STACK_SIZE (64*1024)
64 #define GROUP_SEGMENT_ALIGN 256
66 /* Preserve this amount of additional space in the alloca stack as we need to
67 store the alloca frame pointer to the alloca frame, thus must preserve
68 space for it. This thus supports at most 1024 functions with allocas in
69 a call chain. */
70 #define ALLOCA_OVERHEAD 1024*4
72 uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
74 uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
76 uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
78 uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
80 uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
82 void
83 phsa_fatal_error (int code)
85 exit (code);
88 #ifdef HAVE_FIBERS
89 /* ucontext-based work-item thread implementation. Runs all work-items in
90 separate fibers. */
92 static void
93 phsa_work_item_thread (int arg0, int arg1)
95 void *arg = fiber_int_args_to_ptr (arg0, arg1);
97 PHSAWorkItem *wi = (PHSAWorkItem *) arg;
98 volatile PHSAWorkGroup *wg = wi->wg;
99 PHSAKernelLaunchData *l_data = wi->launch_data;
103 int retcode
104 = fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);
106 /* At this point the threads can assume that either more_wgs is 0 or
107 the current_work_group_* is set to point to the WG executed next. */
108 if (!wi->wg->more_wgs)
109 break;
111 wi->group_x = wg->x;
112 wi->group_y = wg->y;
113 wi->group_z = wg->z;
115 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
116 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
117 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
119 #ifdef DEBUG_PHSA_RT
120 printf (
121 "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
122 wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
123 l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
124 #endif
126 if (wi->x < __hsail_currentworkgroupsize (0, wi)
127 && wi->y < __hsail_currentworkgroupsize (1, wi)
128 && wi->z < __hsail_currentworkgroupsize (2, wi))
130 l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
131 wg->initial_group_offset, wg->private_base_ptr);
132 #ifdef DEBUG_PHSA_RT
133 printf ("done.\n");
134 #endif
135 #ifdef BENCHMARK_PHSA_RT
136 wi_count++;
137 #endif
139 else
141 #ifdef DEBUG_PHSA_RT
142 printf ("skipped (partial WG).\n");
143 #endif
144 #ifdef BENCHMARK_PHSA_RT
145 wis_skipped++;
146 #endif
149 retcode
150 = fiber_barrier_reach ((fiber_barrier_t *)
151 l_data->wg_completion_barrier);
153 /* The first thread updates the WG to execute next etc. */
155 if (retcode == 0)
157 #ifdef EXECUTE_WGS_BACKWARDS
158 if (wg->x == l_data->wg_min_x)
160 wg->x = l_data->wg_max_x - 1;
161 if (wg->y == l_data->wg_min_y)
163 wg->y = l_data->wg_max_y - 1;
164 if (wg->z == l_data->wg_min_z)
165 wg->more_wgs = 0;
166 else
167 wg->z--;
169 else
170 wg->y--;
172 else
173 wg->x--;
174 #else
175 if (wg->x + 1 >= l_data->wg_max_x)
177 wg->x = l_data->wg_min_x;
178 if (wg->y + 1 >= l_data->wg_max_y)
180 wg->y = l_data->wg_min_y;
181 if (wg->z + 1 >= l_data->wg_max_z)
182 wg->more_wgs = 0;
183 else
184 wg->z++;
186 else
187 wg->y++;
189 else
190 wg->x++;
191 #endif
192 wi->group_x = wg->x;
193 wi->group_y = wg->y;
194 wi->group_z = wg->z;
196 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
197 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
198 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
200 /* Reinitialize the work-group barrier according to the new WG's
201 size, which might not be the same as the previous ones, due
202 to "partial WGs". */
203 size_t wg_size = __hsail_currentworkgroupsize (0, wi)
204 * __hsail_currentworkgroupsize (1, wi)
205 * __hsail_currentworkgroupsize (2, wi);
207 #ifdef DEBUG_PHSA_RT
208 printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
209 #endif
210 fiber_barrier_init ((fiber_barrier_t *)
211 wi->launch_data->wg_sync_barrier,
212 wg_size);
214 #ifdef BENCHMARK_PHSA_RT
215 if (wi_count % 1000 == 0)
217 clock_t spent_time = clock () - start_time;
218 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
219 double wis_per_sec = wi_count / spent_time_sec;
220 uint64_t eta_sec
221 = (wi_total - wi_count - wis_skipped) / wis_per_sec;
223 printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
224 "%lu s)\n",
225 wi_count, wis_skipped, (uint64_t) spent_time_sec,
226 (uint64_t) wis_per_sec, (uint64_t) eta_sec);
228 #endif
231 while (1);
233 fiber_exit ();
235 #endif
237 #define MIN(a, b) ((a < b) ? a : b)
238 #define MAX(a, b) ((a > b) ? a : b)
240 #ifdef HAVE_FIBERS
241 /* Spawns a given number of work-items to execute a set of work-groups,
242 blocks until their completion. */
244 static void
245 phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
246 uint32_t group_local_offset, size_t wg_size_x,
247 size_t wg_size_y, size_t wg_size_z)
249 PHSAWorkItem *wi_threads = NULL;
250 PHSAWorkGroup wg;
251 size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
252 uint32_t group_x, group_y, group_z;
253 fiber_barrier_t wg_start_barrier;
254 fiber_barrier_t wg_completion_barrier;
255 fiber_barrier_t wg_sync_barrier;
257 max_x = wg_size_x == 0 ? 1 : wg_size_x;
258 max_y = wg_size_y == 0 ? 1 : wg_size_y;
259 max_z = wg_size_z == 0 ? 1 : wg_size_z;
261 size_t wg_size = max_x * max_y * max_z;
262 if (wg_size > PHSA_MAX_WG_SIZE)
263 phsa_fatal_error (2);
265 wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
266 if (wg.private_segment_total_size > 0
267 && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
268 wg.private_segment_total_size)
269 != 0)
270 phsa_fatal_error (3);
272 wg.alloca_stack_p = wg.private_segment_total_size + ALLOCA_OVERHEAD;
273 wg.alloca_frame_p = wg.alloca_stack_p;
274 wg.initial_group_offset = group_local_offset;
276 #ifdef EXECUTE_WGS_BACKWARDS
277 group_x = context->wg_max_x - 1;
278 group_y = context->wg_max_y - 1;
279 group_z = context->wg_max_z - 1;
280 #else
281 group_x = context->wg_min_x;
282 group_y = context->wg_min_y;
283 group_z = context->wg_min_z;
284 #endif
286 fiber_barrier_init (&wg_sync_barrier, wg_size);
287 fiber_barrier_init (&wg_start_barrier, wg_size);
288 fiber_barrier_init (&wg_completion_barrier, wg_size);
290 context->wg_start_barrier = &wg_start_barrier;
291 context->wg_sync_barrier = &wg_sync_barrier;
292 context->wg_completion_barrier = &wg_completion_barrier;
294 wg.more_wgs = 1;
295 wg.group_base_ptr = group_base_ptr;
297 #ifdef BENCHMARK_PHSA_RT
298 wi_count = 0;
299 wis_skipped = 0;
300 start_time = clock ();
301 #endif
302 wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
303 for (x = 0; x < max_x; ++x)
304 for (y = 0; y < max_y; ++y)
305 for (z = 0; z < max_z; ++z)
307 PHSAWorkItem *wi = &wi_threads[flat_wi_id];
308 wi->launch_data = context;
309 wi->wg = &wg;
311 wg.x = wi->group_x = group_x;
312 wg.y = wi->group_y = group_y;
313 wg.z = wi->group_z = group_z;
315 wi->wg_size_x = context->dp->workgroup_size_x;
316 wi->wg_size_y = context->dp->workgroup_size_y;
317 wi->wg_size_z = context->dp->workgroup_size_z;
319 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
320 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
321 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
323 wi->x = x;
324 wi->y = y;
325 wi->z = z;
327 /* TODO: set the stack size according to the private
328 segment size. Too big stack consumes huge amount of
329 memory in case of huge number of WIs and a too small stack
330 will fail in mysterious and potentially dangerous ways. */
332 fiber_init (&wi->fiber, phsa_work_item_thread, wi,
333 FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
334 ++flat_wi_id;
339 --flat_wi_id;
340 fiber_join (&wi_threads[flat_wi_id].fiber);
342 while (flat_wi_id > 0);
344 if (wg.private_segment_total_size > 0)
345 free (wg.private_base_ptr);
347 free (wi_threads);
350 /* Spawn the work-item threads to execute work-groups and let
351 them execute all the WGs, including a potential partial WG. */
353 static void
354 phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr,
355 uint32_t group_local_offset)
357 hsa_kernel_dispatch_packet_t *dp = context->dp;
358 size_t x, y, z;
360 context->group_segment_start_addr = (size_t) group_base_ptr;
362 /* HSA seems to allow the WG size to be larger than the grid size. We need to
363 saturate the effective WG size to the grid size to prevent the extra WIs
364 from executing. */
365 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
366 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
367 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
368 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
369 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
371 #ifdef BENCHMARK_PHSA_RT
372 wi_total = (uint64_t) dp->grid_size_x
373 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
374 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
375 #endif
377 /* For now execute all work groups in a single coarse thread (does not utilize
378 multicore/multithread). */
379 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
381 int dims = dp->setup & 0x3;
383 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
384 / dp->workgroup_size_x;
386 context->wg_max_y
387 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
388 / dp->workgroup_size_y;
390 context->wg_max_z
391 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
392 / dp->workgroup_size_z;
394 #ifdef DEBUG_PHSA_RT
395 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
396 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
397 context->wg_min_x, context->wg_min_y, context->wg_min_z,
398 context->wg_max_x, context->wg_max_y, context->wg_max_z,
399 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
400 dp->grid_size_y, dp->grid_size_z);
401 #endif
403 phsa_execute_wi_gang (context, group_base_ptr, group_local_offset,
404 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z);
406 #endif
408 /* Executes the given work-group function for all work groups in the grid.
410 A work-group function is a version of the original kernel which executes
411 the kernel for all work-items in a work-group. It is produced by gccbrig
412 if it can handle the kernel's barrier usage and is much faster way to
413 execute massive numbers of work-items in a non-SPMD machine than fibers
414 (easily 100x faster). */
415 static void
416 phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
417 uint32_t group_local_offset)
419 hsa_kernel_dispatch_packet_t *dp = context->dp;
420 size_t x, y, z, wg_x, wg_y, wg_z;
422 context->group_segment_start_addr = (size_t) group_base_ptr;
424 /* HSA seems to allow the WG size to be larger than the grid size. We need
425 to saturate the effective WG size to the grid size to prevent the extra WIs
426 from executing. */
427 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
428 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
429 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
430 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
431 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
433 #ifdef BENCHMARK_PHSA_RT
434 wi_total = (uint64_t) dp->grid_size_x
435 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
436 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
437 #endif
439 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
441 int dims = dp->setup & 0x3;
443 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
444 / dp->workgroup_size_x;
446 context->wg_max_y
447 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
448 / dp->workgroup_size_y;
450 context->wg_max_z
451 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
452 / dp->workgroup_size_z;
454 #ifdef DEBUG_PHSA_RT
455 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
456 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
457 context->wg_min_x, context->wg_min_y, context->wg_min_z,
458 context->wg_max_x, context->wg_max_y, context->wg_max_z,
459 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
460 dp->grid_size_y, dp->grid_size_z);
461 #endif
463 PHSAWorkItem wi;
464 PHSAWorkGroup wg;
465 wi.wg = &wg;
466 wi.x = wi.y = wi.z = 0;
467 wi.launch_data = context;
469 #ifdef BENCHMARK_PHSA_RT
470 start_time = clock ();
471 uint64_t wg_count = 0;
472 #endif
474 size_t wg_size = __hsail_workgroupsize (0, &wi)
475 * __hsail_workgroupsize (1, &wi)
476 * __hsail_workgroupsize (2, &wi);
478 void *private_base_ptr = NULL;
479 if (dp->private_segment_size > 0
480 && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
481 dp->private_segment_size * wg_size)
482 != 0)
483 phsa_fatal_error (3);
485 wg.alloca_stack_p = dp->private_segment_size * wg_size + ALLOCA_OVERHEAD;
486 wg.alloca_frame_p = wg.alloca_stack_p;
488 wg.private_base_ptr = private_base_ptr;
489 wg.group_base_ptr = group_base_ptr;
491 #ifdef DEBUG_PHSA_RT
492 printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
493 wg_size, private_base_ptr);
494 #endif
496 for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
497 for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
498 for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
500 wi.group_x = wg_x;
501 wi.group_y = wg_y;
502 wi.group_z = wg_z;
504 wi.wg_size_x = context->dp->workgroup_size_x;
505 wi.wg_size_y = context->dp->workgroup_size_y;
506 wi.wg_size_z = context->dp->workgroup_size_z;
508 wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
509 wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
510 wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
512 context->kernel (context->kernarg_addr, &wi, group_base_ptr,
513 group_local_offset, private_base_ptr);
515 #if defined (BENCHMARK_PHSA_RT)
516 wg_count++;
517 if (wg_count % 1000000 == 0)
519 clock_t spent_time = clock () - start_time;
520 uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
521 + wg_z * sat_wg_size_z;
522 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
523 double wis_per_sec = wi_count / spent_time_sec;
524 uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
526 printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
527 wi_count, (uint64_t) spent_time_sec,
528 (uint64_t) wis_per_sec, (uint64_t) eta_sec);
530 #endif
533 #ifdef BENCHMARK_PHSA_RT
534 clock_t spent_time = clock () - start_time;
535 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
536 double wis_per_sec = wi_total / spent_time_sec;
538 printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
539 (uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
540 #endif
541 free (private_base_ptr);
542 private_base_ptr = NULL;
545 /* gccbrig generates the following from each HSAIL kernel:
547 1) The actual kernel function (a single work-item kernel or a work-group
548 function) generated from HSAIL (BRIG).
550 static void _Kernel (void* args, void* context, void* group_base_ptr)
555 2) A public facing kernel function that is called from the PHSA runtime:
557 a) A single work-item function (that requires fibers for multi-WI):
559 void Kernel (void* context)
561 __launch_launch_kernel (_Kernel, context);
566 b) a when gccbrig could generate a work-group function:
568 void Kernel (void* context)
570 __hsail_launch_wg_function (_Kernel, context);
574 #ifdef HAVE_FIBERS
576 void
577 __hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
578 void *group_base_ptr, uint32_t group_local_offset)
580 context->kernel = kernel;
581 phsa_spawn_work_items (context, group_base_ptr, group_local_offset);
583 #endif
585 void
586 __hsail_launch_wg_function (gccbrigKernelFunc kernel,
587 PHSAKernelLaunchData *context, void *group_base_ptr,
588 uint32_t group_local_offset)
590 context->kernel = kernel;
591 phsa_execute_work_groups (context, group_base_ptr, group_local_offset);
594 uint32_t
595 __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
597 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
599 uint32_t id;
600 switch (dim)
602 default:
603 case 0:
604 /* Overflow semantics in the case of WG dim > grid dim. */
605 id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
606 % dp->grid_size_x;
607 break;
608 case 1:
609 id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
610 % dp->grid_size_y;
611 break;
612 case 2:
613 id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
614 % dp->grid_size_z;
615 break;
617 return id;
620 uint64_t
621 __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
623 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
625 uint64_t id;
626 switch (dim)
628 default:
629 case 0:
630 /* Overflow semantics in the case of WG dim > grid dim. */
631 id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
632 % dp->grid_size_x;
633 break;
634 case 1:
635 id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
636 % dp->grid_size_y;
637 break;
638 case 2:
639 id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
640 % dp->grid_size_z;
641 break;
643 return id;
647 uint32_t
648 __hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
650 PHSAWorkItem *c = (PHSAWorkItem *) context;
651 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
653 /* The number of dimensions is in the two least significant bits. */
654 int dims = dp->setup & 0x3;
656 uint32_t id;
657 switch (dim)
659 default:
660 case 0:
661 id = c->x;
662 break;
663 case 1:
664 id = dims < 2 ? 0 : c->y;
665 break;
666 case 2:
667 id = dims < 3 ? 0 : c->z;
668 break;
670 return id;
673 uint32_t
674 __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
676 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
677 int dims = dp->setup & 0x3;
679 uint32_t id;
680 switch (dim)
682 default:
683 case 0:
684 id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
685 break;
686 case 1:
687 id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
688 / dp->workgroup_size_y;
689 break;
690 case 2:
691 id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
692 / dp->workgroup_size_z;
693 break;
695 return id;
698 uint32_t
699 __hsail_workitemflatid (PHSAWorkItem *c)
701 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
703 return c->x + c->y * dp->workgroup_size_x
704 + c->z * dp->workgroup_size_x * dp->workgroup_size_y;
707 uint32_t
708 __hsail_currentworkitemflatid (PHSAWorkItem *c)
710 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
712 return c->x + c->y * __hsail_currentworkgroupsize (0, c)
713 + c->z * __hsail_currentworkgroupsize (0, c)
714 * __hsail_currentworkgroupsize (1, c);
717 void
718 __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
720 switch (dim)
722 default:
723 case 0:
724 context->x = id;
725 break;
726 case 1:
727 context->y = id;
728 break;
729 case 2:
730 context->z = id;
731 break;
735 uint64_t
736 __hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
738 PHSAWorkItem *c = (PHSAWorkItem *) context;
739 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
741 /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
742 uint64_t id0 = __hsail_workitemabsid (0, context);
743 uint64_t id1 = __hsail_workitemabsid (1, context);
744 uint64_t id2 = __hsail_workitemabsid (2, context);
746 uint64_t max0 = dp->grid_size_x;
747 uint64_t max1 = dp->grid_size_y;
748 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
750 return id;
753 uint32_t
754 __hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
756 PHSAWorkItem *c = (PHSAWorkItem *) context;
757 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
759 /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
760 uint64_t id0 = __hsail_workitemabsid (0, context);
761 uint64_t id1 = __hsail_workitemabsid (1, context);
762 uint64_t id2 = __hsail_workitemabsid (2, context);
764 uint64_t max0 = dp->grid_size_x;
765 uint64_t max1 = dp->grid_size_y;
766 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
767 return (uint32_t) id;
770 uint32_t
771 __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
773 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
774 uint32_t wg_size = 0;
775 switch (dim)
777 default:
778 case 0:
779 if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
780 wg_size = dp->workgroup_size_x; /* Full WG. */
781 else
782 wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */
783 break;
784 case 1:
785 if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
786 wg_size = dp->workgroup_size_y; /* Full WG. */
787 else
788 wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */
789 break;
790 case 2:
791 if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
792 wg_size = dp->workgroup_size_z; /* Full WG. */
793 else
794 wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */
795 break;
797 return wg_size;
800 uint32_t
801 __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
803 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
804 switch (dim)
806 default:
807 case 0:
808 return dp->workgroup_size_x;
809 case 1:
810 return dp->workgroup_size_y;
811 case 2:
812 return dp->workgroup_size_z;
816 uint32_t
817 __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
819 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
820 switch (dim)
822 default:
823 case 0:
824 return dp->grid_size_x;
825 case 1:
826 return dp->grid_size_y;
827 case 2:
828 return dp->grid_size_z;
832 uint32_t
833 __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
835 switch (dim)
837 default:
838 case 0:
839 return wi->group_x;
840 case 1:
841 return wi->group_y;
842 case 2:
843 return wi->group_z;
847 uint32_t
848 __hsail_dim (PHSAWorkItem *wi)
850 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
851 return dp->setup & 0x3;
854 uint64_t
855 __hsail_packetid (PHSAWorkItem *wi)
857 return wi->launch_data->packet_id;
860 uint32_t
861 __hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
863 return (uint32_t) wi->launch_data->dp->completion_signal.handle;
866 uint64_t
867 __hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
869 return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
872 #ifdef HAVE_FIBERS
873 void
874 __hsail_barrier (PHSAWorkItem *wi)
876 fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
878 #endif
880 /* Return a 32b private segment address that points to a dynamically
881 allocated chunk of 'size' with 'align'.
883 Allocates the space from the end of the private segment allocated
884 for the whole work group. In implementations with separate private
885 memories per WI, we will need to have a stack pointer per WI. But in
886 the current implementation, the segment is shared, so we possibly
887 save some space in case all WIs do not call the alloca.
889 The "alloca frames" are organized as follows:
891 wg->alloca_stack_p points to the last allocated data (initially
892 outside the private segment)
893 wg->alloca_frame_p points to the first address _outside_ the current
894 function's allocations (initially to the same as alloca_stack_p)
896 The data is allocated downwards from the end of the private segment.
898 In the beginning of a new function which has allocas, a new alloca
899 frame is pushed which adds the current alloca_frame_p (the current
900 function's frame starting point) to the top of the alloca stack and
901 alloca_frame_p is set to the current stack position.
903 At the exit points of a function with allocas, the alloca frame
904 is popped before returning. This involves popping the alloca_frame_p
905 to the one of the previous function in the call stack, and alloca_stack_p
906 similarly, to the position of the last word alloca'd by the previous
907 function.
910 uint32_t
911 __hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
913 volatile PHSAWorkGroup *wg = wi->wg;
914 int64_t new_pos = wg->alloca_stack_p - size;
915 while (new_pos % align != 0)
916 new_pos--;
917 if (new_pos < 0)
918 phsa_fatal_error (2);
920 wg->alloca_stack_p = new_pos;
922 #ifdef DEBUG_ALLOCA
923 printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
924 wg->alloca_stack_p, wg->alloca_frame_p);
925 #endif
926 return new_pos;
929 /* Initializes a new "alloca frame" in the private segment.
930 This should be called at all the function entry points in case
931 the function contains at least one call to alloca. */
933 void
934 __hsail_alloca_push_frame (PHSAWorkItem *wi)
936 volatile PHSAWorkGroup *wg = wi->wg;
938 /* Store the alloca_frame_p without any alignment padding so
939 we know exactly where the previous frame ended after popping
940 it. */
941 #ifdef DEBUG_ALLOCA
942 printf ("--- push frame ");
943 #endif
944 uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
945 memcpy (wg->private_base_ptr + last_word_offs,
946 (const void *) &wg->alloca_frame_p, 4);
947 wg->alloca_frame_p = last_word_offs;
949 #ifdef DEBUG_ALLOCA
950 printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
951 #endif
954 /* Frees the current "alloca frame" and restores the frame
955 pointer.
956 This should be called at all the function return points in case
957 the function contains at least one call to alloca. Restores the
958 alloca stack to the condition it was before pushing the frame
959 the last time. */
960 void
961 __hsail_alloca_pop_frame (PHSAWorkItem *wi)
963 volatile PHSAWorkGroup *wg = wi->wg;
965 wg->alloca_stack_p = wg->alloca_frame_p;
966 memcpy ((void *) &wg->alloca_frame_p,
967 (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
968 /* Now frame_p points to the beginning of the previous function's
969 frame and stack_p to its end. */
971 wg->alloca_stack_p += 4;
973 #ifdef DEBUG_ALLOCA
974 printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
975 wg->alloca_frame_p);
976 #endif