1 /* workitems.c -- The main runtime entry that performs work-item execution in
2 various ways and the builtin functions closely related to the
5 Copyright (C) 2015-2017 Free Software Foundation, Inc.
6 Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
7 for General Processor Tech.
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files
11 (the "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be included
18 in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
24 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
25 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
26 USE OR OTHER DEALINGS IN THE SOFTWARE.  */
29 /* The fiber based multiple work-item work-group execution uses ucontext
30 based user mode threading. However, if gccbrig is able to optimize the
31 kernel to a much faster work-group function that implements the multiple
32 WI execution using loops instead of fibers requiring slow context switches,
33 the fiber-based implementation won't be called.  */
40 #include "workitems.h"
47 #ifdef BENCHMARK_PHSA_RT
51 static uint64_t wi_count
= 0;
52 static uint64_t wis_skipped
= 0;
53 static uint64_t wi_total
= 0;
54 static clock_t start_time
;
62 #define PRIVATE_SEGMENT_ALIGN 256
63 #define FIBER_STACK_SIZE (64*1024)
64 #define GROUP_SEGMENT_ALIGN 256
66 /* HSA requires WGs to be executed in flat work-group id order. Enabling
67 the following macro can reveal test cases that rely on the ordering,
68 but is not useful for much else. */
70 uint32_t __hsail_workitemabsid (uint32_t dim
, PHSAWorkItem
*context
);
72 uint32_t __hsail_workitemid (uint32_t dim
, PHSAWorkItem
*context
);
74 uint32_t __hsail_gridgroups (uint32_t dim
, PHSAWorkItem
*context
);
76 uint32_t __hsail_currentworkgroupsize (uint32_t dim
, PHSAWorkItem
*wi
);
78 uint32_t __hsail_workgroupsize (uint32_t dim
, PHSAWorkItem
*wi
);
81 phsa_fatal_error (int code
)
87 /* ucontext-based work-item thread implementation. Runs all work-items in
91 phsa_work_item_thread (int arg0
, int arg1
)
93 void *arg
= fiber_int_args_to_ptr (arg0
, arg1
);
95 PHSAWorkItem
*wi
= (PHSAWorkItem
*) arg
;
96 volatile PHSAWorkGroup
*wg
= wi
->wg
;
97 PHSAKernelLaunchData
*l_data
= wi
->launch_data
;
102 = fiber_barrier_reach ((fiber_barrier_t
*) l_data
->wg_start_barrier
);
104 /* At this point the threads can assume that either more_wgs is 0 or
105 the current_work_group_* is set to point to the WG executed next. */
106 if (!wi
->wg
->more_wgs
)
110 "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
111 wi
->x
, wi
->y
, wi
->z
, wg
->x
, wg
->y
, wg
->z
, l_data
->wg_max_x
,
112 l_data
->wg_max_y
, l_data
->wg_max_z
);
115 if (wi
->x
< __hsail_currentworkgroupsize (0, wi
)
116 && wi
->y
< __hsail_currentworkgroupsize (1, wi
)
117 && wi
->z
< __hsail_currentworkgroupsize (2, wi
))
119 l_data
->kernel (l_data
->kernarg_addr
, wi
, wg
->group_base_ptr
,
120 wg
->private_base_ptr
);
124 #ifdef BENCHMARK_PHSA_RT
131 printf ("skipped (partial WG).\n");
133 #ifdef BENCHMARK_PHSA_RT
139 = fiber_barrier_reach ((fiber_barrier_t
*)
140 l_data
->wg_completion_barrier
);
142 /* The first thread updates the WG to execute next etc. */
146 #ifdef EXECUTE_WGS_BACKWARDS
147 if (wg
->x
== l_data
->wg_min_x
)
149 wg
->x
= l_data
->wg_max_x
- 1;
150 if (wg
->y
== l_data
->wg_min_y
)
152 wg
->y
= l_data
->wg_max_y
- 1;
153 if (wg
->z
== l_data
->wg_min_z
)
164 if (wg
->x
+ 1 >= l_data
->wg_max_x
)
166 wg
->x
= l_data
->wg_min_x
;
167 if (wg
->y
+ 1 >= l_data
->wg_max_y
)
169 wg
->y
= l_data
->wg_min_y
;
170 if (wg
->z
+ 1 >= l_data
->wg_max_z
)
182 /* Reinitialize the work-group barrier according to the new WG's
183 size, which might not be the same as the previous ones, due
185 size_t wg_size
= __hsail_currentworkgroupsize (0, wi
)
186 * __hsail_currentworkgroupsize (1, wi
)
187 * __hsail_currentworkgroupsize (2, wi
);
190 printf ("Reinitializing the WG barrier to %lu.\n", wg_size
);
192 fiber_barrier_init ((fiber_barrier_t
*)
193 wi
->launch_data
->wg_sync_barrier
,
196 #ifdef BENCHMARK_PHSA_RT
197 if (wi_count
% 1000 == 0)
199 clock_t spent_time
= clock () - start_time
;
200 double spent_time_sec
= (double) spent_time
/ CLOCKS_PER_SEC
;
201 double wis_per_sec
= wi_count
/ spent_time_sec
;
203 = (wi_total
- wi_count
- wis_skipped
) / wis_per_sec
;
205 printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
207 wi_count
, wis_skipped
, (uint64_t) spent_time_sec
,
208 (uint64_t) wis_per_sec
, (uint64_t) eta_sec
);
219 #define MIN(a, b) ((a < b) ? a : b)
220 #define MAX(a, b) ((a > b) ? a : b)
223 /* Spawns a given number of work-items to execute a set of work-groups,
224 blocks until their completion. */
227 phsa_execute_wi_gang (PHSAKernelLaunchData
*context
, void *group_base_ptr
,
228 size_t wg_size_x
, size_t wg_size_y
, size_t wg_size_z
)
230 PHSAWorkItem
*wi_threads
= NULL
;
232 size_t flat_wi_id
= 0, x
, y
, z
, max_x
, max_y
, max_z
;
233 fiber_barrier_t wg_start_barrier
;
234 fiber_barrier_t wg_completion_barrier
;
235 fiber_barrier_t wg_sync_barrier
;
237 max_x
= wg_size_x
== 0 ? 1 : wg_size_x
;
238 max_y
= wg_size_y
== 0 ? 1 : wg_size_y
;
239 max_z
= wg_size_z
== 0 ? 1 : wg_size_z
;
241 size_t wg_size
= max_x
* max_y
* max_z
;
242 if (wg_size
> PHSA_MAX_WG_SIZE
)
243 phsa_fatal_error (2);
245 wg
.private_segment_total_size
= context
->dp
->private_segment_size
* wg_size
;
246 if (wg
.private_segment_total_size
> 0
247 && posix_memalign (&wg
.private_base_ptr
, PRIVATE_SEGMENT_ALIGN
,
248 wg
.private_segment_total_size
)
250 phsa_fatal_error (3);
252 wg
.alloca_stack_p
= wg
.private_segment_total_size
;
253 wg
.alloca_frame_p
= wg
.alloca_stack_p
;
255 #ifdef EXECUTE_WGS_BACKWARDS
256 wg
.x
= context
->wg_max_x
- 1;
257 wg
.y
= context
->wg_max_y
- 1;
258 wg
.z
= context
->wg_max_z
- 1;
260 wg
.x
= context
->wg_min_x
;
261 wg
.y
= context
->wg_min_y
;
262 wg
.z
= context
->wg_min_z
;
265 fiber_barrier_init (&wg_sync_barrier
, wg_size
);
266 fiber_barrier_init (&wg_start_barrier
, wg_size
);
267 fiber_barrier_init (&wg_completion_barrier
, wg_size
);
269 context
->wg_start_barrier
= &wg_start_barrier
;
270 context
->wg_sync_barrier
= &wg_sync_barrier
;
271 context
->wg_completion_barrier
= &wg_completion_barrier
;
274 wg
.group_base_ptr
= group_base_ptr
;
276 #ifdef BENCHMARK_PHSA_RT
279 start_time
= clock ();
281 wi_threads
= malloc (sizeof (PHSAWorkItem
) * max_x
* max_y
* max_z
);
282 for (x
= 0; x
< max_x
; ++x
)
283 for (y
= 0; y
< max_y
; ++y
)
284 for (z
= 0; z
< max_z
; ++z
)
286 PHSAWorkItem
*wi
= &wi_threads
[flat_wi_id
];
287 wi
->launch_data
= context
;
293 /* TODO: set the stack size according to the private
294 segment size. Too big stack consumes huge amount of
295 memory in case of huge number of WIs and a too small stack
296 will fail in mysterious and potentially dangerous ways. */
298 fiber_init (&wi
->fiber
, phsa_work_item_thread
, wi
,
299 FIBER_STACK_SIZE
, PRIVATE_SEGMENT_ALIGN
);
306 fiber_join (&wi_threads
[flat_wi_id
].fiber
);
308 while (flat_wi_id
> 0);
310 if (wg
.private_segment_total_size
> 0)
311 free (wg
.private_base_ptr
);
316 /* Spawn the work-item threads to execute work-groups and let
317 them execute all the WGs, including a potential partial WG. */
320 phsa_spawn_work_items (PHSAKernelLaunchData
*context
, void *group_base_ptr
)
322 hsa_kernel_dispatch_packet_t
*dp
= context
->dp
;
325 /* TO DO: host-side memory management of group and private segment
326 memory. Agents in general are less likely to support efficient dynamic mem
328 if (dp
->group_segment_size
> 0
329 && posix_memalign (&group_base_ptr
, PRIVATE_SEGMENT_ALIGN
,
330 dp
->group_segment_size
) != 0)
331 phsa_fatal_error (3);
333 context
->group_segment_start_addr
= (size_t) group_base_ptr
;
335 /* HSA seems to allow the WG size to be larger than the grid size. We need to
336 saturate the effective WG size to the grid size to prevent the extra WIs
338 size_t sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, sat_wg_size
;
339 sat_wg_size_x
= MIN (dp
->workgroup_size_x
, dp
->grid_size_x
);
340 sat_wg_size_y
= MIN (dp
->workgroup_size_y
, dp
->grid_size_y
);
341 sat_wg_size_z
= MIN (dp
->workgroup_size_z
, dp
->grid_size_z
);
342 sat_wg_size
= sat_wg_size_x
* sat_wg_size_y
* sat_wg_size_z
;
344 #ifdef BENCHMARK_PHSA_RT
345 wi_total
= (uint64_t) dp
->grid_size_x
346 * (dp
->grid_size_y
> 0 ? dp
->grid_size_y
: 1)
347 * (dp
->grid_size_z
> 0 ? dp
->grid_size_z
: 1);
350 /* For now execute all work groups in a single coarse thread (does not utilize
351 multicore/multithread). */
352 context
->wg_min_x
= context
->wg_min_y
= context
->wg_min_z
= 0;
354 int dims
= dp
->setup
& 0x3;
356 context
->wg_max_x
= ((uint64_t) dp
->grid_size_x
+ dp
->workgroup_size_x
- 1)
357 / dp
->workgroup_size_x
;
360 = dims
< 2 ? 1 : ((uint64_t) dp
->grid_size_y
+ dp
->workgroup_size_y
- 1)
361 / dp
->workgroup_size_y
;
364 = dims
< 3 ? 1 : ((uint64_t) dp
->grid_size_z
+ dp
->workgroup_size_z
- 1)
365 / dp
->workgroup_size_z
;
368 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
369 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
370 context
->wg_min_x
, context
->wg_min_y
, context
->wg_min_z
,
371 context
->wg_max_x
, context
->wg_max_y
, context
->wg_max_z
,
372 sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, dp
->grid_size_x
,
373 dp
->grid_size_y
, dp
->grid_size_z
);
376 phsa_execute_wi_gang (context
, group_base_ptr
, sat_wg_size_x
, sat_wg_size_y
,
379 if (dp
->group_segment_size
> 0)
380 free (group_base_ptr
);
384 /* Executes the given work-group function for all work groups in the grid.
386 A work-group function is a version of the original kernel which executes
387 the kernel for all work-items in a work-group. It is produced by gccbrig
388 if it can handle the kernel's barrier usage and is much faster way to
389 execute massive numbers of work-items in a non-SPMD machine than fibers
390 (easily 100x faster). */
392 phsa_execute_work_groups (PHSAKernelLaunchData
*context
, void *group_base_ptr
)
394 hsa_kernel_dispatch_packet_t
*dp
= context
->dp
;
395 size_t x
, y
, z
, wg_x
, wg_y
, wg_z
;
397 /* TODO: host-side memory management of group and private segment
398 memory. Agents in general are less likely to support efficient dynamic mem
400 if (dp
->group_segment_size
> 0
401 && posix_memalign (&group_base_ptr
, GROUP_SEGMENT_ALIGN
,
402 dp
->group_segment_size
) != 0)
403 phsa_fatal_error (3);
405 context
->group_segment_start_addr
= (size_t) group_base_ptr
;
407 /* HSA seems to allow the WG size to be larger than the grid size. We need
408 to saturate the effective WG size to the grid size to prevent the extra WIs
410 size_t sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, sat_wg_size
;
411 sat_wg_size_x
= MIN (dp
->workgroup_size_x
, dp
->grid_size_x
);
412 sat_wg_size_y
= MIN (dp
->workgroup_size_y
, dp
->grid_size_y
);
413 sat_wg_size_z
= MIN (dp
->workgroup_size_z
, dp
->grid_size_z
);
414 sat_wg_size
= sat_wg_size_x
* sat_wg_size_y
* sat_wg_size_z
;
416 #ifdef BENCHMARK_PHSA_RT
417 wi_total
= (uint64_t) dp
->grid_size_x
418 * (dp
->grid_size_y
> 0 ? dp
->grid_size_y
: 1)
419 * (dp
->grid_size_z
> 0 ? dp
->grid_size_z
: 1);
422 context
->wg_min_x
= context
->wg_min_y
= context
->wg_min_z
= 0;
424 int dims
= dp
->setup
& 0x3;
426 context
->wg_max_x
= ((uint64_t) dp
->grid_size_x
+ dp
->workgroup_size_x
- 1)
427 / dp
->workgroup_size_x
;
430 = dims
< 2 ? 1 : ((uint64_t) dp
->grid_size_y
+ dp
->workgroup_size_y
- 1)
431 / dp
->workgroup_size_y
;
434 = dims
< 3 ? 1 : ((uint64_t) dp
->grid_size_z
+ dp
->workgroup_size_z
- 1)
435 / dp
->workgroup_size_z
;
438 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
439 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
440 context
->wg_min_x
, context
->wg_min_y
, context
->wg_min_z
,
441 context
->wg_max_x
, context
->wg_max_y
, context
->wg_max_z
,
442 sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, dp
->grid_size_x
,
443 dp
->grid_size_y
, dp
->grid_size_z
);
449 wi
.x
= wi
.y
= wi
.z
= 0;
450 wi
.launch_data
= context
;
452 #ifdef BENCHMARK_PHSA_RT
453 start_time
= clock ();
454 uint64_t wg_count
= 0;
457 size_t wg_size
= __hsail_workgroupsize (0, &wi
)
458 * __hsail_workgroupsize (1, &wi
)
459 * __hsail_workgroupsize (2, &wi
);
461 void *private_base_ptr
= NULL
;
462 if (dp
->private_segment_size
> 0
463 && posix_memalign (&private_base_ptr
, PRIVATE_SEGMENT_ALIGN
,
464 dp
->private_segment_size
* wg_size
)
466 phsa_fatal_error (3);
468 wg
.alloca_stack_p
= dp
->private_segment_size
* wg_size
;
469 wg
.alloca_frame_p
= wg
.alloca_stack_p
;
471 wg
.private_base_ptr
= private_base_ptr
;
472 wg
.group_base_ptr
= group_base_ptr
;
475 printf ("priv seg size %u wg_size %lu @ %p\n", dp
->private_segment_size
,
476 wg_size
, private_base_ptr
);
479 for (wg_z
= context
->wg_min_z
; wg_z
< context
->wg_max_z
; ++wg_z
)
480 for (wg_y
= context
->wg_min_y
; wg_y
< context
->wg_max_y
; ++wg_y
)
481 for (wg_x
= context
->wg_min_x
; wg_x
< context
->wg_max_x
; ++wg_x
)
487 context
->kernel (context
->kernarg_addr
, &wi
, group_base_ptr
,
490 #if defined (BENCHMARK_PHSA_RT)
492 if (wg_count
% 1000000 == 0)
494 clock_t spent_time
= clock () - start_time
;
495 uint64_t wi_count
= wg_x
* sat_wg_size_x
+ wg_y
* sat_wg_size_y
496 + wg_z
* sat_wg_size_z
;
497 double spent_time_sec
= (double) spent_time
/ CLOCKS_PER_SEC
;
498 double wis_per_sec
= wi_count
/ spent_time_sec
;
499 uint64_t eta_sec
= (wi_total
- wi_count
) / wis_per_sec
;
501 printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
502 wi_count
, (uint64_t) spent_time_sec
,
503 (uint64_t) wis_per_sec
, (uint64_t) eta_sec
);
508 #ifdef BENCHMARK_PHSA_RT
509 clock_t spent_time
= clock () - start_time
;
510 double spent_time_sec
= (double) spent_time
/ CLOCKS_PER_SEC
;
511 double wis_per_sec
= wi_total
/ spent_time_sec
;
513 printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total
,
514 (uint64_t) spent_time_sec
, (uint64_t) wis_per_sec
);
517 if (dp
->group_segment_size
> 0)
518 free (group_base_ptr
);
520 free (private_base_ptr
);
521 private_base_ptr
= NULL
;
524 /* gccbrig generates the following from each HSAIL kernel:
526 1) The actual kernel function (a single work-item kernel or a work-group
527 function) generated from HSAIL (BRIG).
529 static void _Kernel (void* args, void* context, void* group_base_ptr)
534 2) A public facing kernel function that is called from the PHSA runtime:
536 a) A single work-item function (that requires fibers for multi-WI):
538 void Kernel (void* context)
540 __launch_launch_kernel (_Kernel, context);
545 b) a when gccbrig could generate a work-group function:
547 void Kernel (void* context)
549 __hsail_launch_wg_function (_Kernel, context);
556 __hsail_launch_kernel (gccbrigKernelFunc kernel
, PHSAKernelLaunchData
*context
,
557 void *group_base_ptr
)
559 context
->kernel
= kernel
;
560 phsa_spawn_work_items (context
, group_base_ptr
);
565 __hsail_launch_wg_function (gccbrigKernelFunc kernel
,
566 PHSAKernelLaunchData
*context
, void *group_base_ptr
)
568 context
->kernel
= kernel
;
569 phsa_execute_work_groups (context
, group_base_ptr
);
573 __hsail_workitemabsid (uint32_t dim
, PHSAWorkItem
*context
)
575 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
582 /* Overflow semantics in the case of WG dim > grid dim. */
583 id
= ((uint64_t) context
->wg
->x
* dp
->workgroup_size_x
+ context
->x
)
587 id
= ((uint64_t) context
->wg
->y
* dp
->workgroup_size_y
+ context
->y
)
591 id
= ((uint64_t) context
->wg
->z
* dp
->workgroup_size_z
+ context
->z
)
599 __hsail_workitemabsid_u64 (uint32_t dim
, PHSAWorkItem
*context
)
601 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
608 /* Overflow semantics in the case of WG dim > grid dim. */
609 id
= ((uint64_t) context
->wg
->x
* dp
->workgroup_size_x
+ context
->x
)
613 id
= ((uint64_t) context
->wg
->y
* dp
->workgroup_size_y
+ context
->y
)
617 id
= ((uint64_t) context
->wg
->z
* dp
->workgroup_size_z
+ context
->z
)
626 __hsail_workitemid (uint32_t dim
, PHSAWorkItem
*context
)
628 PHSAWorkItem
*c
= (PHSAWorkItem
*) context
;
629 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
631 /* The number of dimensions is in the two least significant bits. */
632 int dims
= dp
->setup
& 0x3;
642 id
= dims
< 2 ? 0 : c
->y
;
645 id
= dims
< 3 ? 0 : c
->z
;
652 __hsail_gridgroups (uint32_t dim
, PHSAWorkItem
*context
)
654 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
655 int dims
= dp
->setup
& 0x3;
662 id
= (dp
->grid_size_x
+ dp
->workgroup_size_x
- 1) / dp
->workgroup_size_x
;
665 id
= dims
< 2 ? 1 : (dp
->grid_size_y
+ dp
->workgroup_size_y
- 1)
666 / dp
->workgroup_size_y
;
669 id
= dims
< 3 ? 1 : (dp
->grid_size_z
+ dp
->workgroup_size_z
- 1)
670 / dp
->workgroup_size_z
;
677 __hsail_workitemflatid (PHSAWorkItem
*c
)
679 hsa_kernel_dispatch_packet_t
*dp
= c
->launch_data
->dp
;
681 return c
->x
+ c
->y
* dp
->workgroup_size_x
682 + c
->z
* dp
->workgroup_size_x
* dp
->workgroup_size_y
;
686 __hsail_currentworkitemflatid (PHSAWorkItem
*c
)
688 hsa_kernel_dispatch_packet_t
*dp
= c
->launch_data
->dp
;
690 return c
->x
+ c
->y
* __hsail_currentworkgroupsize (0, c
)
691 + c
->z
* __hsail_currentworkgroupsize (0, c
)
692 * __hsail_currentworkgroupsize (1, c
);
696 __hsail_setworkitemid (uint32_t dim
, uint32_t id
, PHSAWorkItem
*context
)
714 __hsail_workitemflatabsid_u64 (PHSAWorkItem
*context
)
716 PHSAWorkItem
*c
= (PHSAWorkItem
*) context
;
717 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
719 /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
720 uint64_t id0
= __hsail_workitemabsid (0, context
);
721 uint64_t id1
= __hsail_workitemabsid (1, context
);
722 uint64_t id2
= __hsail_workitemabsid (2, context
);
724 uint64_t max0
= dp
->grid_size_x
;
725 uint64_t max1
= dp
->grid_size_y
;
726 uint64_t id
= id0
+ id1
* max0
+ id2
* max0
* max1
;
732 __hsail_workitemflatabsid_u32 (PHSAWorkItem
*context
)
734 PHSAWorkItem
*c
= (PHSAWorkItem
*) context
;
735 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
737 /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
738 uint64_t id0
= __hsail_workitemabsid (0, context
);
739 uint64_t id1
= __hsail_workitemabsid (1, context
);
740 uint64_t id2
= __hsail_workitemabsid (2, context
);
742 uint64_t max0
= dp
->grid_size_x
;
743 uint64_t max1
= dp
->grid_size_y
;
744 uint64_t id
= id0
+ id1
* max0
+ id2
* max0
* max1
;
745 return (uint32_t) id
;
749 __hsail_currentworkgroupsize (uint32_t dim
, PHSAWorkItem
*wi
)
751 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
752 uint32_t wg_size
= 0;
757 if ((uint64_t) wi
->wg
->x
< dp
->grid_size_x
/ dp
->workgroup_size_x
)
758 wg_size
= dp
->workgroup_size_x
; /* Full WG. */
760 wg_size
= dp
->grid_size_x
% dp
->workgroup_size_x
; /* Partial WG. */
763 if ((uint64_t) wi
->wg
->y
< dp
->grid_size_y
/ dp
->workgroup_size_y
)
764 wg_size
= dp
->workgroup_size_y
; /* Full WG. */
766 wg_size
= dp
->grid_size_y
% dp
->workgroup_size_y
; /* Partial WG. */
769 if ((uint64_t) wi
->wg
->z
< dp
->grid_size_z
/ dp
->workgroup_size_z
)
770 wg_size
= dp
->workgroup_size_z
; /* Full WG. */
772 wg_size
= dp
->grid_size_z
% dp
->workgroup_size_z
; /* Partial WG. */
779 __hsail_workgroupsize (uint32_t dim
, PHSAWorkItem
*wi
)
781 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
786 return dp
->workgroup_size_x
;
788 return dp
->workgroup_size_y
;
790 return dp
->workgroup_size_z
;
795 __hsail_gridsize (uint32_t dim
, PHSAWorkItem
*wi
)
797 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
802 return dp
->grid_size_x
;
804 return dp
->grid_size_y
;
806 return dp
->grid_size_z
;
811 __hsail_workgroupid (uint32_t dim
, PHSAWorkItem
*wi
)
826 __hsail_dim (PHSAWorkItem
*wi
)
828 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
829 return dp
->setup
& 0x3;
833 __hsail_packetid (PHSAWorkItem
*wi
)
835 return wi
->launch_data
->packet_id
;
839 __hsail_packetcompletionsig_sig32 (PHSAWorkItem
*wi
)
841 return (uint32_t) wi
->launch_data
->dp
->completion_signal
.handle
;
845 __hsail_packetcompletionsig_sig64 (PHSAWorkItem
*wi
)
847 return (uint64_t) (wi
->launch_data
->dp
->completion_signal
.handle
);
852 __hsail_barrier (PHSAWorkItem
*wi
)
854 fiber_barrier_reach ((fiber_barrier_t
*) wi
->launch_data
->wg_sync_barrier
);
858 /* Return a 32b private segment address that points to a dynamically
859 allocated chunk of 'size' with 'align'.
861 Allocates the space from the end of the private segment allocated
862 for the whole work group. In implementations with separate private
863 memories per WI, we will need to have a stack pointer per WI. But in
864 the current implementation, the segment is shared, so we possibly
865 save some space in case all WIs do not call the alloca.
867 The "alloca frames" are organized as follows:
869 wg->alloca_stack_p points to the last allocated data (initially
870 outside the private segment)
871 wg->alloca_frame_p points to the first address _outside_ the current
872 function's allocations (initially to the same as alloca_stack_p)
874 The data is allocated downwards from the end of the private segment.
876 In the beginning of a new function which has allocas, a new alloca
877 frame is pushed which adds the current alloca_frame_p (the current
878 function's frame starting point) to the top of the alloca stack and
879 alloca_frame_p is set to the current stack position.
881 At the exit points of a function with allocas, the alloca frame
882 is popped before returning. This involves popping the alloca_frame_p
883 to the one of the previous function in the call stack, and alloca_stack_p
884 similarly, to the position of the last word alloca'd by the previous
889 __hsail_alloca (uint32_t size
, uint32_t align
, PHSAWorkItem
*wi
)
891 volatile PHSAWorkGroup
*wg
= wi
->wg
;
892 uint32_t new_pos
= wg
->alloca_stack_p
- size
;
893 while (new_pos
% align
!= 0)
895 wg
->alloca_stack_p
= new_pos
;
898 printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size
, align
,
899 wg
->alloca_stack_p
, wg
->alloca_frame_p
);
904 /* Initializes a new "alloca frame" in the private segment.
905 This should be called at all the function entry points in case
906 the function contains at least one call to alloca. */
909 __hsail_alloca_push_frame (PHSAWorkItem
*wi
)
911 volatile PHSAWorkGroup
*wg
= wi
->wg
;
913 /* Store the alloca_frame_p without any alignment padding so
914 we know exactly where the previous frame ended after popping
917 printf ("--- push frame ");
919 uint32_t last_word_offs
= __hsail_alloca (4, 1, wi
);
920 memcpy (wg
->private_base_ptr
+ last_word_offs
,
921 (const void *) &wg
->alloca_frame_p
, 4);
922 wg
->alloca_frame_p
= last_word_offs
;
925 printf ("--- sp @%u fp @%u\n", wg
->alloca_stack_p
, wg
->alloca_frame_p
);
929 /* Frees the current "alloca frame" and restores the frame
931 This should be called at all the function return points in case
932 the function contains at least one call to alloca. Restores the
933 alloca stack to the condition it was before pushing the frame
936 __hsail_alloca_pop_frame (PHSAWorkItem
*wi
)
938 volatile PHSAWorkGroup
*wg
= wi
->wg
;
940 wg
->alloca_stack_p
= wg
->alloca_frame_p
;
941 memcpy ((void *) &wg
->alloca_frame_p
,
942 (const void *) (wg
->private_base_ptr
+ wg
->alloca_frame_p
), 4);
943 /* Now frame_p points to the beginning of the previous function's
944 frame and stack_p to its end. */
946 wg
->alloca_stack_p
+= 4;
949 printf ("--- pop frame sp @%u fp @%u\n", wg
->alloca_stack_p
,