1 /* workitems.c -- The main runtime entry that performs work-item execution in
2 various ways and the builtin functions closely related to the
5 Copyright (C) 2015-2018 Free Software Foundation, Inc.
6 Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
7 for General Processor Tech.
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files
11 (the "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be included
18 in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
24 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
25 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
26 USE OR OTHER DEALINGS IN THE SOFTWARE.
29 /* The fiber based multiple work-item work-group execution uses ucontext
30 based user mode threading. However, if gccbrig is able to optimize the
31 kernel to a much faster work-group function that implements the multiple
32 WI execution using loops instead of fibers requiring slow context switches,
33 the fiber-based implementation won't be called.
40 #include "workitems.h"
/* Statistics counters for the optional runtime benchmarking mode.
   NOTE(review): the matching #endif is not visible in this chunk.  */
47 #ifdef BENCHMARK_PHSA_RT
/* Number of work-items executed so far.  */
51 static uint64_t wi_count
= 0;
/* Number of work-items skipped (outside the bounds of a partial WG).  */
52 static uint64_t wis_skipped
= 0;
/* Total number of work-items in the launched grid.  */
53 static uint64_t wi_total
= 0;
/* CPU time stamp (clock ()) taken when the execution started.  */
54 static clock_t start_time
;
/* Alignment (bytes) used when allocating the private segment storage.  */
62 #define PRIVATE_SEGMENT_ALIGN 256
/* Stack size of each work-item fiber.  NOTE(review): fixed size; see the
   TODO near fiber_init about deriving it from the private segment size.  */
63 #define FIBER_STACK_SIZE (64*1024)
/* Alignment (bytes) for the group segment base.  */
64 #define GROUP_SEGMENT_ALIGN 256
66 /* Preserve this amount of additional space in the alloca stack as we need to
67 store the alloca frame pointer to the alloca frame, thus must preserve
68 space for it. This thus supports at most 1024 functions with allocas in
/* Extra space reserved in the alloca stack: each alloca frame stores the
   previous frame pointer in a 4-byte slot (see __hsail_alloca_push_frame),
   so this supports at most 1024 nested functions that use alloca.
   Parenthesized so the macro expands safely inside any expression.  */
#define ALLOCA_OVERHEAD (1024*4)
/* Forward declarations of the HSAIL work-item builtins defined later in
   this file; the fiber thread function below needs them.  */
72 uint32_t __hsail_workitemabsid (uint32_t dim
, PHSAWorkItem
*context
);
74 uint32_t __hsail_workitemid (uint32_t dim
, PHSAWorkItem
*context
);
76 uint32_t __hsail_gridgroups (uint32_t dim
, PHSAWorkItem
*context
);
78 uint32_t __hsail_currentworkgroupsize (uint32_t dim
, PHSAWorkItem
*wi
);
80 uint32_t __hsail_workgroupsize (uint32_t dim
, PHSAWorkItem
*wi
);
/* Terminate execution with the given runtime error CODE.
   NOTE(review): the return type and the function body are not visible in
   this chunk of the file.  */
83 phsa_fatal_error (int code
)
89 /* ucontext-based work-item thread implementation. Runs all work-items in
/* Fiber entry point for a single work-item.  ARG0/ARG1 carry a pointer to
   the PHSAWorkItem context split into two ints (rebuilt below with
   fiber_int_args_to_ptr).
   NOTE(review): gaps in the embedded original line numbering show that many
   lines (braces, the main loop construct, #else/#endif directives) are
   missing from this chunk, so the fragment below is not self-contained.  */
93 phsa_work_item_thread (int arg0
, int arg1
)
/* Rebuild the work-item context pointer from the two int arguments.  */
95 void *arg
= fiber_int_args_to_ptr (arg0
, arg1
);
97 PHSAWorkItem
*wi
= (PHSAWorkItem
*) arg
;
98 volatile PHSAWorkGroup
*wg
= wi
->wg
;
99 PHSAKernelLaunchData
*l_data
= wi
->launch_data
;
/* Wait until all work-items of the WG have reached the start barrier.  */
104 = fiber_barrier_reach ((fiber_barrier_t
*) l_data
->wg_start_barrier
);
106 /* At this point the threads can assume that either more_wgs is 0 or
107 the current_work_group_* is set to point to the WG executed next. */
108 if (!wi
->wg
->more_wgs
)
/* Cache the current (possibly partial) WG dimensions for this WI.  */
115 wi
->cur_wg_size_x
= __hsail_currentworkgroupsize (0, wi
);
116 wi
->cur_wg_size_y
= __hsail_currentworkgroupsize (1, wi
);
117 wi
->cur_wg_size_z
= __hsail_currentworkgroupsize (2, wi
);
/* Debug trace of the WI and WG coordinates (format string fragment).  */
121 "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
122 wi
->x
, wi
->y
, wi
->z
, wi
->group_x
, wi
->group_y
, wi
->group_z
,
123 l_data
->wg_max_x
, l_data
->wg_max_y
, l_data
->wg_max_z
);
/* Run the kernel only for WIs that fall inside the (possibly partial)
   current work-group; out-of-bounds WIs are skipped.  */
126 if (wi
->x
< __hsail_currentworkgroupsize (0, wi
)
127 && wi
->y
< __hsail_currentworkgroupsize (1, wi
)
128 && wi
->z
< __hsail_currentworkgroupsize (2, wi
))
130 l_data
->kernel (l_data
->kernarg_addr
, wi
, wg
->group_base_ptr
,
131 wg
->initial_group_offset
, wg
->private_base_ptr
);
135 #ifdef BENCHMARK_PHSA_RT
142 printf ("skipped (partial WG).\n");
144 #ifdef BENCHMARK_PHSA_RT
/* Wait for the whole WG to finish before advancing to the next WG.  */
150 = fiber_barrier_reach ((fiber_barrier_t
*)
151 l_data
->wg_completion_barrier
);
153 /* The first thread updates the WG to execute next etc. */
/* WG iteration order: wrapping decrement (backwards) or wrapping
   increment (forwards) through the WG grid bounds in l_data.  */
157 #ifdef EXECUTE_WGS_BACKWARDS
158 if (wg
->x
== l_data
->wg_min_x
)
160 wg
->x
= l_data
->wg_max_x
- 1;
161 if (wg
->y
== l_data
->wg_min_y
)
163 wg
->y
= l_data
->wg_max_y
- 1;
164 if (wg
->z
== l_data
->wg_min_z
)
175 if (wg
->x
+ 1 >= l_data
->wg_max_x
)
177 wg
->x
= l_data
->wg_min_x
;
178 if (wg
->y
+ 1 >= l_data
->wg_max_y
)
180 wg
->y
= l_data
->wg_min_y
;
181 if (wg
->z
+ 1 >= l_data
->wg_max_z
)
/* Refresh the cached WG dimensions for the newly selected WG.  */
196 wi
->cur_wg_size_x
= __hsail_currentworkgroupsize (0, wi
);
197 wi
->cur_wg_size_y
= __hsail_currentworkgroupsize (1, wi
);
198 wi
->cur_wg_size_z
= __hsail_currentworkgroupsize (2, wi
);
200 /* Reinitialize the work-group barrier according to the new WG's
201 size, which might not be the same as the previous ones, due
203 size_t wg_size
= __hsail_currentworkgroupsize (0, wi
)
204 * __hsail_currentworkgroupsize (1, wi
)
205 * __hsail_currentworkgroupsize (2, wi
);
208 printf ("Reinitializing the WG barrier to %lu.\n", wg_size
);
210 fiber_barrier_init ((fiber_barrier_t
*)
211 wi
->launch_data
->wg_sync_barrier
,
/* Periodic benchmark statistics, printed every 1000 work-items.  */
214 #ifdef BENCHMARK_PHSA_RT
215 if (wi_count
% 1000 == 0)
217 clock_t spent_time
= clock () - start_time
;
218 double spent_time_sec
= (double) spent_time
/ CLOCKS_PER_SEC
;
219 double wis_per_sec
= wi_count
/ spent_time_sec
;
/* NOTE(review): the declaration of eta_sec (original line 220) is
   missing from this chunk; only its initializer remains below.  */
221 = (wi_total
- wi_count
- wis_skipped
) / wis_per_sec
;
223 printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
225 wi_count
, wis_skipped
, (uint64_t) spent_time_sec
,
226 (uint64_t) wis_per_sec
, (uint64_t) eta_sec
);
/* Minimum/maximum of two values.  Arguments are fully parenthesized so the
   macros expand correctly for compound expressions (e.g. ternaries); note
   each argument may still be evaluated twice, so avoid side effects in the
   operands.  */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
241 /* Spawns a given number of work-items to execute a set of work-groups,
242 blocks until their completion. */
/* NOTE(review): gaps in the embedded original line numbering show that
   several lines (return type, braces, loop/do-while constructs, #endif
   directives) are missing from this chunk.  */
245 phsa_execute_wi_gang (PHSAKernelLaunchData
*context
, void *group_base_ptr
,
246 uint32_t group_local_offset
, size_t wg_size_x
,
247 size_t wg_size_y
, size_t wg_size_z
)
249 PHSAWorkItem
*wi_threads
= NULL
;
251 size_t flat_wi_id
= 0, x
, y
, z
, max_x
, max_y
, max_z
;
252 uint32_t group_x
, group_y
, group_z
;
253 fiber_barrier_t wg_start_barrier
;
254 fiber_barrier_t wg_completion_barrier
;
255 fiber_barrier_t wg_sync_barrier
;
/* Treat unset (zero) WG dimensions as one.  */
257 max_x
= wg_size_x
== 0 ? 1 : wg_size_x
;
258 max_y
= wg_size_y
== 0 ? 1 : wg_size_y
;
259 max_z
= wg_size_z
== 0 ? 1 : wg_size_z
;
261 size_t wg_size
= max_x
* max_y
* max_z
;
262 if (wg_size
> PHSA_MAX_WG_SIZE
)
263 phsa_fatal_error (2);
/* One private segment chunk per work-item of the WG.  */
265 wg
.private_segment_total_size
= context
->dp
->private_segment_size
* wg_size
;
266 if (wg
.private_segment_total_size
> 0
267 && posix_memalign (&wg
.private_base_ptr
, PRIVATE_SEGMENT_ALIGN
,
268 wg
.private_segment_total_size
)
270 phsa_fatal_error (3);
/* The alloca stack grows downwards from the end of the private segment;
   ALLOCA_OVERHEAD reserves room for the per-frame bookkeeping slots.  */
272 wg
.alloca_stack_p
= wg
.private_segment_total_size
+ ALLOCA_OVERHEAD
;
273 wg
.alloca_frame_p
= wg
.alloca_stack_p
;
274 wg
.initial_group_offset
= group_local_offset
;
/* Pick the first WG to execute depending on the iteration direction.  */
276 #ifdef EXECUTE_WGS_BACKWARDS
277 group_x
= context
->wg_max_x
- 1;
278 group_y
= context
->wg_max_y
- 1;
279 group_z
= context
->wg_max_z
- 1;
281 group_x
= context
->wg_min_x
;
282 group_y
= context
->wg_min_y
;
283 group_z
= context
->wg_min_z
;
/* The barriers are sized to the full WG so every WI participates.  */
286 fiber_barrier_init (&wg_sync_barrier
, wg_size
);
287 fiber_barrier_init (&wg_start_barrier
, wg_size
);
288 fiber_barrier_init (&wg_completion_barrier
, wg_size
);
290 context
->wg_start_barrier
= &wg_start_barrier
;
291 context
->wg_sync_barrier
= &wg_sync_barrier
;
292 context
->wg_completion_barrier
= &wg_completion_barrier
;
295 wg
.group_base_ptr
= group_base_ptr
;
297 #ifdef BENCHMARK_PHSA_RT
300 start_time
= clock ();
/* Create one fiber (user-mode thread) per work-item of the WG.  */
302 wi_threads
= malloc (sizeof (PHSAWorkItem
) * max_x
* max_y
* max_z
);
303 for (x
= 0; x
< max_x
; ++x
)
304 for (y
= 0; y
< max_y
; ++y
)
305 for (z
= 0; z
< max_z
; ++z
)
307 PHSAWorkItem
*wi
= &wi_threads
[flat_wi_id
];
308 wi
->launch_data
= context
;
311 wg
.x
= wi
->group_x
= group_x
;
312 wg
.y
= wi
->group_y
= group_y
;
313 wg
.z
= wi
->group_z
= group_z
;
315 wi
->wg_size_x
= context
->dp
->workgroup_size_x
;
316 wi
->wg_size_y
= context
->dp
->workgroup_size_y
;
317 wi
->wg_size_z
= context
->dp
->workgroup_size_z
;
319 wi
->cur_wg_size_x
= __hsail_currentworkgroupsize (0, wi
);
320 wi
->cur_wg_size_y
= __hsail_currentworkgroupsize (1, wi
);
321 wi
->cur_wg_size_z
= __hsail_currentworkgroupsize (2, wi
);
327 /* TODO: set the stack size according to the private
328 segment size. Too big stack consumes huge amount of
329 memory in case of huge number of WIs and a too small stack
330 will fail in mysterious and potentially dangerous ways. */
332 fiber_init (&wi
->fiber
, phsa_work_item_thread
, wi
,
333 FIBER_STACK_SIZE
, PRIVATE_SEGMENT_ALIGN
);
/* Wait for all work-item fibers to finish.  NOTE(review): the do-loop
   header for this join (original lines before 340) is missing here.  */
340 fiber_join (&wi_threads
[flat_wi_id
].fiber
);
342 while (flat_wi_id
> 0);
/* Free the private segment storage allocated above.  */
344 if (wg
.private_segment_total_size
> 0)
345 free (wg
.private_base_ptr
);
350 /* Spawn the work-item threads to execute work-groups and let
351 them execute all the WGs, including a potential partial WG. */
/* Fiber-based execution entry: computes the WG grid bounds into CONTEXT and
   delegates to phsa_execute_wi_gang.  NOTE(review): gaps in the embedded
   line numbering show some lines (return type, braces, #endif, the LHS of
   the wg_max_y/wg_max_z assignments) are missing from this chunk.  */
354 phsa_spawn_work_items (PHSAKernelLaunchData
*context
, void *group_base_ptr
,
355 uint32_t group_local_offset
)
357 hsa_kernel_dispatch_packet_t
*dp
= context
->dp
;
360 context
->group_segment_start_addr
= (size_t) group_base_ptr
;
362 /* HSA seems to allow the WG size to be larger than the grid size. We need to
363 saturate the effective WG size to the grid size to prevent the extra WIs
365 size_t sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, sat_wg_size
;
366 sat_wg_size_x
= MIN (dp
->workgroup_size_x
, dp
->grid_size_x
);
367 sat_wg_size_y
= MIN (dp
->workgroup_size_y
, dp
->grid_size_y
);
368 sat_wg_size_z
= MIN (dp
->workgroup_size_z
, dp
->grid_size_z
);
369 sat_wg_size
= sat_wg_size_x
* sat_wg_size_y
* sat_wg_size_z
;
/* Total WI count for the benchmark statistics (unset dims count as 1).  */
371 #ifdef BENCHMARK_PHSA_RT
372 wi_total
= (uint64_t) dp
->grid_size_x
373 * (dp
->grid_size_y
> 0 ? dp
->grid_size_y
: 1)
374 * (dp
->grid_size_z
> 0 ? dp
->grid_size_z
: 1);
377 /* For now execute all work groups in a single coarse thread (does not utilize
378 multicore/multithread). */
379 context
->wg_min_x
= context
->wg_min_y
= context
->wg_min_z
= 0;
/* Dimension count is encoded in the two low bits of the setup field.  */
381 int dims
= dp
->setup
& 0x3;
/* Number of WGs per dimension, rounded up (ceiling division); unused
   dimensions get a single WG.  */
383 context
->wg_max_x
= ((uint64_t) dp
->grid_size_x
+ dp
->workgroup_size_x
- 1)
384 / dp
->workgroup_size_x
;
387 = dims
< 2 ? 1 : ((uint64_t) dp
->grid_size_y
+ dp
->workgroup_size_y
- 1)
388 / dp
->workgroup_size_y
;
391 = dims
< 3 ? 1 : ((uint64_t) dp
->grid_size_z
+ dp
->workgroup_size_z
- 1)
392 / dp
->workgroup_size_z
;
/* Debug trace of the launch parameters.  */
395 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
396 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
397 context
->wg_min_x
, context
->wg_min_y
, context
->wg_min_z
,
398 context
->wg_max_x
, context
->wg_max_y
, context
->wg_max_z
,
399 sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, dp
->grid_size_x
,
400 dp
->grid_size_y
, dp
->grid_size_z
);
403 phsa_execute_wi_gang (context
, group_base_ptr
, group_local_offset
,
404 sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
);
408 /* Executes the given work-group function for all work groups in the grid.
410 A work-group function is a version of the original kernel which executes
411 the kernel for all work-items in a work-group. It is produced by gccbrig
412 if it can handle the kernel's barrier usage and is much faster way to
413 execute massive numbers of work-items in a non-SPMD machine than fibers
414 (easily 100x faster). */
/* NOTE(review): gaps in the embedded line numbering show that several lines
   (return type, braces, the PHSAWorkItem/PHSAWorkGroup declarations, #endif
   directives, the LHS of the wg_max_y/wg_max_z assignments) are missing
   from this chunk.  */
416 phsa_execute_work_groups (PHSAKernelLaunchData
*context
, void *group_base_ptr
,
417 uint32_t group_local_offset
)
419 hsa_kernel_dispatch_packet_t
*dp
= context
->dp
;
420 size_t x
, y
, z
, wg_x
, wg_y
, wg_z
;
422 context
->group_segment_start_addr
= (size_t) group_base_ptr
;
424 /* HSA seems to allow the WG size to be larger than the grid size. We need
425 to saturate the effective WG size to the grid size to prevent the extra WIs
427 size_t sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, sat_wg_size
;
428 sat_wg_size_x
= MIN (dp
->workgroup_size_x
, dp
->grid_size_x
);
429 sat_wg_size_y
= MIN (dp
->workgroup_size_y
, dp
->grid_size_y
);
430 sat_wg_size_z
= MIN (dp
->workgroup_size_z
, dp
->grid_size_z
);
431 sat_wg_size
= sat_wg_size_x
* sat_wg_size_y
* sat_wg_size_z
;
/* Total WI count for the benchmark statistics (unset dims count as 1).  */
433 #ifdef BENCHMARK_PHSA_RT
434 wi_total
= (uint64_t) dp
->grid_size_x
435 * (dp
->grid_size_y
> 0 ? dp
->grid_size_y
: 1)
436 * (dp
->grid_size_z
> 0 ? dp
->grid_size_z
: 1);
439 context
->wg_min_x
= context
->wg_min_y
= context
->wg_min_z
= 0;
/* Dimension count is encoded in the two low bits of the setup field.  */
441 int dims
= dp
->setup
& 0x3;
/* Number of WGs per dimension, rounded up (ceiling division).  */
443 context
->wg_max_x
= ((uint64_t) dp
->grid_size_x
+ dp
->workgroup_size_x
- 1)
444 / dp
->workgroup_size_x
;
447 = dims
< 2 ? 1 : ((uint64_t) dp
->grid_size_y
+ dp
->workgroup_size_y
- 1)
448 / dp
->workgroup_size_y
;
451 = dims
< 3 ? 1 : ((uint64_t) dp
->grid_size_z
+ dp
->workgroup_size_z
- 1)
452 / dp
->workgroup_size_z
;
/* Debug trace of the launch parameters.  */
455 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
456 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
457 context
->wg_min_x
, context
->wg_min_y
, context
->wg_min_z
,
458 context
->wg_max_x
, context
->wg_max_y
, context
->wg_max_z
,
459 sat_wg_size_x
, sat_wg_size_y
, sat_wg_size_z
, dp
->grid_size_x
,
460 dp
->grid_size_y
, dp
->grid_size_z
);
/* A single WI context is reused for all WGs; the WG function iterates the
   WIs internally.  */
466 wi
.x
= wi
.y
= wi
.z
= 0;
467 wi
.launch_data
= context
;
469 #ifdef BENCHMARK_PHSA_RT
470 start_time
= clock ();
471 uint64_t wg_count
= 0;
474 size_t wg_size
= __hsail_workgroupsize (0, &wi
)
475 * __hsail_workgroupsize (1, &wi
)
476 * __hsail_workgroupsize (2, &wi
);
/* One private segment chunk per WI of a single WG (WGs run serially).  */
478 void *private_base_ptr
= NULL
;
479 if (dp
->private_segment_size
> 0
480 && posix_memalign (&private_base_ptr
, PRIVATE_SEGMENT_ALIGN
,
481 dp
->private_segment_size
* wg_size
)
483 phsa_fatal_error (3);
485 wg
.alloca_stack_p
= dp
->private_segment_size
* wg_size
+ ALLOCA_OVERHEAD
;
486 wg
.alloca_frame_p
= wg
.alloca_stack_p
;
488 wg
.private_base_ptr
= private_base_ptr
;
489 wg
.group_base_ptr
= group_base_ptr
;
492 printf ("priv seg size %u wg_size %lu @ %p\n", dp
->private_segment_size
,
493 wg_size
, private_base_ptr
);
/* Iterate over all WGs of the grid, invoking the WG function once per WG.  */
496 for (wg_z
= context
->wg_min_z
; wg_z
< context
->wg_max_z
; ++wg_z
)
497 for (wg_y
= context
->wg_min_y
; wg_y
< context
->wg_max_y
; ++wg_y
)
498 for (wg_x
= context
->wg_min_x
; wg_x
< context
->wg_max_x
; ++wg_x
)
504 wi
.wg_size_x
= context
->dp
->workgroup_size_x
;
505 wi
.wg_size_y
= context
->dp
->workgroup_size_y
;
506 wi
.wg_size_z
= context
->dp
->workgroup_size_z
;
508 wi
.cur_wg_size_x
= __hsail_currentworkgroupsize (0, &wi
);
509 wi
.cur_wg_size_y
= __hsail_currentworkgroupsize (1, &wi
);
510 wi
.cur_wg_size_z
= __hsail_currentworkgroupsize (2, &wi
);
512 context
->kernel (context
->kernarg_addr
, &wi
, group_base_ptr
,
513 group_local_offset
, private_base_ptr
);
/* Periodic benchmark statistics, printed every 1000000 WGs.  */
515 #if defined (BENCHMARK_PHSA_RT)
517 if (wg_count
% 1000000 == 0)
519 clock_t spent_time
= clock () - start_time
;
520 uint64_t wi_count
= wg_x
* sat_wg_size_x
+ wg_y
* sat_wg_size_y
521 + wg_z
* sat_wg_size_z
;
522 double spent_time_sec
= (double) spent_time
/ CLOCKS_PER_SEC
;
523 double wis_per_sec
= wi_count
/ spent_time_sec
;
524 uint64_t eta_sec
= (wi_total
- wi_count
) / wis_per_sec
;
526 printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
527 wi_count
, (uint64_t) spent_time_sec
,
528 (uint64_t) wis_per_sec
, (uint64_t) eta_sec
);
/* Final benchmark summary after all WGs have executed.  */
533 #ifdef BENCHMARK_PHSA_RT
534 clock_t spent_time
= clock () - start_time
;
535 double spent_time_sec
= (double) spent_time
/ CLOCKS_PER_SEC
;
536 double wis_per_sec
= wi_total
/ spent_time_sec
;
538 printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total
,
539 (uint64_t) spent_time_sec
, (uint64_t) wis_per_sec
);
541 free (private_base_ptr
);
542 private_base_ptr
= NULL
;
545 /* gccbrig generates the following from each HSAIL kernel:
547 1) The actual kernel function (a single work-item kernel or a work-group
548 function) generated from HSAIL (BRIG).
550 static void _Kernel (void* args, void* context, void* group_base_ptr)
555 2) A public facing kernel function that is called from the PHSA runtime:
557 a) A single work-item function (that requires fibers for multi-WI):
559 void Kernel (void* context)
561 __launch_launch_kernel (_Kernel, context);
566 b) when gccbrig could generate a work-group function:
568 void Kernel (void* context)
570 __hsail_launch_wg_function (_Kernel, context);
/* Launch a single work-item KERNEL: stores the kernel pointer in CONTEXT and
   runs it via the fiber-based multi-WI path (phsa_spawn_work_items).
   NOTE(review): the return type and braces are missing from this chunk.  */
577 __hsail_launch_kernel (gccbrigKernelFunc kernel
, PHSAKernelLaunchData
*context
,
578 void *group_base_ptr
, uint32_t group_local_offset
)
580 context
->kernel
= kernel
;
581 phsa_spawn_work_items (context
, group_base_ptr
, group_local_offset
);
/* Launch a gccbrig-generated work-group function KERNEL: stores the kernel
   pointer in CONTEXT and runs it via the loop-based (fiberless) path
   (phsa_execute_work_groups).
   NOTE(review): the return type and braces are missing from this chunk.  */
586 __hsail_launch_wg_function (gccbrigKernelFunc kernel
,
587 PHSAKernelLaunchData
*context
, void *group_base_ptr
,
588 uint32_t group_local_offset
)
590 context
->kernel
= kernel
;
591 phsa_execute_work_groups (context
, group_base_ptr
, group_local_offset
);
/* Return the absolute (grid-wide) work-item ID for dimension DIM:
   group coordinate * WG size + local coordinate, computed in 64 bits.
   NOTE(review): the return type, braces, switch/case and return statements
   are missing from this chunk; only the per-dimension expressions remain.  */
595 __hsail_workitemabsid (uint32_t dim
, PHSAWorkItem
*context
)
597 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
604 /* Overflow semantics in the case of WG dim > grid dim. */
605 id
= ((uint64_t) context
->group_x
* dp
->workgroup_size_x
+ context
->x
)
609 id
= ((uint64_t) context
->group_y
* dp
->workgroup_size_y
+ context
->y
)
613 id
= ((uint64_t) context
->group_z
* dp
->workgroup_size_z
+ context
->z
)
/* 64-bit variant of __hsail_workitemabsid: absolute work-item ID on DIM.
   NOTE(review): the return type, braces, switch/case and return statements
   are missing from this chunk; only the per-dimension expressions remain.  */
621 __hsail_workitemabsid_u64 (uint32_t dim
, PHSAWorkItem
*context
)
623 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
630 /* Overflow semantics in the case of WG dim > grid dim. */
631 id
= ((uint64_t) context
->group_x
* dp
->workgroup_size_x
+ context
->x
)
635 id
= ((uint64_t) context
->group_y
* dp
->workgroup_size_y
+ context
->y
)
639 id
= ((uint64_t) context
->group_z
* dp
->workgroup_size_z
+ context
->z
)
/* Return the work-item's local (within-WG) ID on dimension DIM; dimensions
   beyond the dispatch's dimension count yield 0.
   NOTE(review): the return type, braces, the x-dimension case and the
   return statements are missing from this chunk.  */
648 __hsail_workitemid (uint32_t dim
, PHSAWorkItem
*context
)
650 PHSAWorkItem
*c
= (PHSAWorkItem
*) context
;
651 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
653 /* The number of dimensions is in the two least significant bits. */
654 int dims
= dp
->setup
& 0x3;
664 id
= dims
< 2 ? 0 : c
->y
;
667 id
= dims
< 3 ? 0 : c
->z
;
/* Return the number of work-groups in the grid on dimension DIM (ceiling
   of grid size / WG size); dimensions beyond the dispatch's dimension
   count yield 1.
   NOTE(review): the return type, braces, switch/case and return statements
   are missing from this chunk.  */
674 __hsail_gridgroups (uint32_t dim
, PHSAWorkItem
*context
)
676 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
677 int dims
= dp
->setup
& 0x3;
684 id
= (dp
->grid_size_x
+ dp
->workgroup_size_x
- 1) / dp
->workgroup_size_x
;
687 id
= dims
< 2 ? 1 : (dp
->grid_size_y
+ dp
->workgroup_size_y
- 1)
688 / dp
->workgroup_size_y
;
691 id
= dims
< 3 ? 1 : (dp
->grid_size_z
+ dp
->workgroup_size_z
- 1)
692 / dp
->workgroup_size_z
;
/* Return the flattened local work-item ID within the (full) work-group:
   x + y * sizeX + z * sizeX * sizeY, using the dispatch's WG dimensions.
   NOTE(review): the return type and braces are missing from this chunk.  */
699 __hsail_workitemflatid (PHSAWorkItem
*c
)
701 hsa_kernel_dispatch_packet_t
*dp
= c
->launch_data
->dp
;
703 return c
->x
+ c
->y
* dp
->workgroup_size_x
704 + c
->z
* dp
->workgroup_size_x
* dp
->workgroup_size_y
;
/* Return the flattened local work-item ID within the *current* (possibly
   partial) work-group, using the current WG dimensions instead of the
   dispatch's nominal ones.
   NOTE(review): the return type and braces are missing from this chunk.  */
708 __hsail_currentworkitemflatid (PHSAWorkItem
*c
)
710 hsa_kernel_dispatch_packet_t
*dp
= c
->launch_data
->dp
;
712 return c
->x
+ c
->y
* __hsail_currentworkgroupsize (0, c
)
713 + c
->z
* __hsail_currentworkgroupsize (0, c
)
714 * __hsail_currentworkgroupsize (1, c
);
/* Set the work-item's local ID on dimension DIM to ID.
   NOTE(review): the return type and the whole function body are missing
   from this chunk.  */
718 __hsail_setworkitemid (uint32_t dim
, uint32_t id
, PHSAWorkItem
*context
)
/* Return the flattened absolute (grid-wide) work-item ID as described in
   the comment below, using grid sizes as the per-dimension extents.
   NOTE(review): the return type, braces and the return statement are
   missing from this chunk.  */
736 __hsail_workitemflatabsid_u64 (PHSAWorkItem
*context
)
738 PHSAWorkItem
*c
= (PHSAWorkItem
*) context
;
739 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
741 /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
742 uint64_t id0
= __hsail_workitemabsid (0, context
);
743 uint64_t id1
= __hsail_workitemabsid (1, context
);
744 uint64_t id2
= __hsail_workitemabsid (2, context
);
746 uint64_t max0
= dp
->grid_size_x
;
747 uint64_t max1
= dp
->grid_size_y
;
748 uint64_t id
= id0
+ id1
* max0
+ id2
* max0
* max1
;
/* 32-bit variant of __hsail_workitemflatabsid_u64: the flattened absolute
   work-item ID truncated to uint32_t.
   NOTE(review): the return type and braces are missing from this chunk.  */
754 __hsail_workitemflatabsid_u32 (PHSAWorkItem
*context
)
756 PHSAWorkItem
*c
= (PHSAWorkItem
*) context
;
757 hsa_kernel_dispatch_packet_t
*dp
= context
->launch_data
->dp
;
759 /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
760 uint64_t id0
= __hsail_workitemabsid (0, context
);
761 uint64_t id1
= __hsail_workitemabsid (1, context
);
762 uint64_t id2
= __hsail_workitemabsid (2, context
);
764 uint64_t max0
= dp
->grid_size_x
;
765 uint64_t max1
= dp
->grid_size_y
;
766 uint64_t id
= id0
+ id1
* max0
+ id2
* max0
* max1
;
767 return (uint32_t) id
;
/* Return the size of WI's *current* work-group on dimension DIM: the full
   WG size when the WG lies fully inside the grid, or the grid-size
   remainder for a trailing partial WG.
   NOTE(review): the return type, braces, switch/case, else branches and the
   final return are missing from this chunk.  */
771 __hsail_currentworkgroupsize (uint32_t dim
, PHSAWorkItem
*wi
)
773 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
774 uint32_t wg_size
= 0;
779 if ((uint64_t) wi
->group_x
< dp
->grid_size_x
/ dp
->workgroup_size_x
)
780 wg_size
= dp
->workgroup_size_x
; /* Full WG. */
782 wg_size
= dp
->grid_size_x
% dp
->workgroup_size_x
; /* Partial WG. */
785 if ((uint64_t) wi
->group_y
< dp
->grid_size_y
/ dp
->workgroup_size_y
)
786 wg_size
= dp
->workgroup_size_y
; /* Full WG. */
788 wg_size
= dp
->grid_size_y
% dp
->workgroup_size_y
; /* Partial WG. */
791 if ((uint64_t) wi
->group_z
< dp
->grid_size_z
/ dp
->workgroup_size_z
)
792 wg_size
= dp
->workgroup_size_z
; /* Full WG. */
794 wg_size
= dp
->grid_size_z
% dp
->workgroup_size_z
; /* Partial WG. */
/* Return the dispatch's nominal work-group size on dimension DIM.
   NOTE(review): the return type, braces and the switch/case construct
   selecting between the three returns are missing from this chunk.  */
801 __hsail_workgroupsize (uint32_t dim
, PHSAWorkItem
*wi
)
803 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
808 return dp
->workgroup_size_x
;
810 return dp
->workgroup_size_y
;
812 return dp
->workgroup_size_z
;
/* Return the dispatch's grid size on dimension DIM.
   NOTE(review): the return type, braces and the switch/case construct
   selecting between the three returns are missing from this chunk.  */
817 __hsail_gridsize (uint32_t dim
, PHSAWorkItem
*wi
)
819 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
824 return dp
->grid_size_x
;
826 return dp
->grid_size_y
;
828 return dp
->grid_size_z
;
/* Return WI's work-group coordinate on dimension DIM.
   NOTE(review): the return type and the whole function body are missing
   from this chunk.  */
833 __hsail_workgroupid (uint32_t dim
, PHSAWorkItem
*wi
)
/* Return the dispatch's dimension count, encoded in the two least
   significant bits of the setup field.
   NOTE(review): the return type and braces are missing from this chunk.  */
848 __hsail_dim (PHSAWorkItem
*wi
)
850 hsa_kernel_dispatch_packet_t
*dp
= wi
->launch_data
->dp
;
851 return dp
->setup
& 0x3;
/* Return the AQL packet ID of the current dispatch.
   NOTE(review): the return type and braces are missing from this chunk.  */
855 __hsail_packetid (PHSAWorkItem
*wi
)
857 return wi
->launch_data
->packet_id
;
/* Return the dispatch packet's completion-signal handle truncated to
   32 bits.
   NOTE(review): the return type and braces are missing from this chunk.  */
861 __hsail_packetcompletionsig_sig32 (PHSAWorkItem
*wi
)
863 return (uint32_t) wi
->launch_data
->dp
->completion_signal
.handle
;
/* Return the dispatch packet's completion-signal handle as 64 bits.
   NOTE(review): the return type and braces are missing from this chunk.  */
867 __hsail_packetcompletionsig_sig64 (PHSAWorkItem
*wi
)
869 return (uint64_t) (wi
->launch_data
->dp
->completion_signal
.handle
);
/* HSAIL work-group barrier: block this work-item fiber until all WIs of
   the work-group have reached the WG sync barrier.
   NOTE(review): the return type and braces are missing from this chunk.  */
874 __hsail_barrier (PHSAWorkItem
*wi
)
876 fiber_barrier_reach ((fiber_barrier_t
*) wi
->launch_data
->wg_sync_barrier
);
880 /* Return a 32b private segment address that points to a dynamically
881 allocated chunk of 'size' with 'align'.
883 Allocates the space from the end of the private segment allocated
884 for the whole work group. In implementations with separate private
885 memories per WI, we will need to have a stack pointer per WI. But in
886 the current implementation, the segment is shared, so we possibly
887 save some space in case all WIs do not call the alloca.
889 The "alloca frames" are organized as follows:
891 wg->alloca_stack_p points to the last allocated data (initially
892 outside the private segment)
893 wg->alloca_frame_p points to the first address _outside_ the current
894 function's allocations (initially to the same as alloca_stack_p)
896 The data is allocated downwards from the end of the private segment.
898 In the beginning of a new function which has allocas, a new alloca
899 frame is pushed which adds the current alloca_frame_p (the current
900 function's frame starting point) to the top of the alloca stack and
901 alloca_frame_p is set to the current stack position.
903 At the exit points of a function with allocas, the alloca frame
904 is popped before returning. This involves popping the alloca_frame_p
905 to the one of the previous function in the call stack, and alloca_stack_p
906 similarly, to the position of the last word alloca'd by the previous
/* Allocate SIZE bytes with alignment ALIGN from the work-group's shared
   alloca stack, which grows downwards from the end of the private segment
   (see the design comment above).  Returns an offset into the private
   segment.
   NOTE(review): the return type, braces, the loop body adjusting new_pos
   for alignment, the underflow check guarding phsa_fatal_error (2) and the
   return statement are missing from this chunk.  */
911 __hsail_alloca (uint32_t size
, uint32_t align
, PHSAWorkItem
*wi
)
913 volatile PHSAWorkGroup
*wg
= wi
->wg
;
/* Candidate position: move the stack pointer down by SIZE, then align.  */
914 int64_t new_pos
= wg
->alloca_stack_p
- size
;
915 while (new_pos
% align
!= 0)
918 phsa_fatal_error (2);
920 wg
->alloca_stack_p
= new_pos
;
/* Debug trace of the allocation and resulting stack/frame pointers.  */
923 printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size
, align
,
924 wg
->alloca_stack_p
, wg
->alloca_frame_p
);
929 /* Initializes a new "alloca frame" in the private segment.
930 This should be called at all the function entry points in case
931 the function contains at least one call to alloca. */
/* NOTE(review): the return type and braces are missing from this chunk.  */
934 __hsail_alloca_push_frame (PHSAWorkItem
*wi
)
936 volatile PHSAWorkGroup
*wg
= wi
->wg
;
938 /* Store the alloca_frame_p without any alignment padding so
939 we know exactly where the previous frame ended after popping
942 printf ("--- push frame ");
/* Reserve a 4-byte, unaligned slot and save the previous frame pointer
   there; the slot's offset becomes the new frame pointer.  */
944 uint32_t last_word_offs
= __hsail_alloca (4, 1, wi
);
945 memcpy (wg
->private_base_ptr
+ last_word_offs
,
946 (const void *) &wg
->alloca_frame_p
, 4);
947 wg
->alloca_frame_p
= last_word_offs
;
950 printf ("--- sp @%u fp @%u\n", wg
->alloca_stack_p
, wg
->alloca_frame_p
);
954 /* Frees the current "alloca frame" and restores the frame
956 This should be called at all the function return points in case
957 the function contains at least one call to alloca. Restores the
958 alloca stack to the condition it was before pushing the frame
961 __hsail_alloca_pop_frame (PHSAWorkItem
*wi
)
963 volatile PHSAWorkGroup
*wg
= wi
->wg
;
965 wg
->alloca_stack_p
= wg
->alloca_frame_p
;
966 memcpy ((void *) &wg
->alloca_frame_p
,
967 (const void *) (wg
->private_base_ptr
+ wg
->alloca_frame_p
), 4);
968 /* Now frame_p points to the beginning of the previous function's
969 frame and stack_p to its end. */
971 wg
->alloca_stack_p
+= 4;
974 printf ("--- pop frame sp @%u fp @%u\n", wg
->alloca_stack_p
,