From cf470867baedce340eff08a80afb6e9b3018d7ea Mon Sep 17 00:00:00 2001
From: Sven Verdoolaege
Date: Wed, 2 Oct 2013 21:54:23 +0200
Subject: [PATCH] gpu_group_references: distinguish between shared_schedule and copy_schedule

The initial mapping to shared memory is currently computed
right outside the band that is mapped to threads.
In a subsequent commit, it will be made possible to introduce
this mapping at a higher level.
In preparation, adjust gpu_group_references to take into account
this distinction by introducing a copy_schedule that is used
for determining the initial overlap between accesses.
In particular, when a reference group can only be mapped to
private memory, overlap will still be determined at the deepest level.

Signed-off-by: Sven Verdoolaege
---
 gpu_group.c | 47 ++++++++++++++++++++++++++++++++++++++++-------
 gpu_group.h |  2 +-
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/gpu_group.c b/gpu_group.c
index efbb916..199e99f 100644
--- a/gpu_group.c
+++ b/gpu_group.c
@@ -439,18 +439,24 @@ static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
  * kernel_depth is the schedule depth where the kernel launch will
  * be introduced, i.e., it is the depth of the band that is mapped
  * to blocks.
+ * shared_depth is the schedule depth at which the copying to/from
+ * shared memory is computed. The copy operation may then
+ * later be hoisted to a higher level.
  * thread_depth is the schedule depth where the thread mark is located,
  * i.e., it is the depth of the band that is mapped to threads and also
- * the schedule depth at which the copying to/from shared/private memory
+ * the schedule depth at which the copying to/from private memory
  * is computed. The copy operation may then later be hoisted to
  * a higher level.
+ * Currently, shared_depth is equal to thread_depth.
  * n_thread is the number of schedule dimensions in the band that
  * is mapped to threads.
  * privatization lives in the range of thread_sched (i.e., it is
  * of dimension thread_depth + n_thread) and encodes the mapping
  * to thread identifiers (as parameters).
  * host_sched contains the kernel_depth dimensions of the host schedule.
- * shared_sched contains the first thread_depth dimensions of the
+ * shared_sched contains the first shared_depth dimensions of the
+ * kernel schedule.
+ * copy_sched contains the first thread_depth dimensions of the
  * kernel schedule.
  * thread_sched contains the first (thread_depth + n_thread) dimensions
  * of the kernel schedule.
@@ -462,11 +468,13 @@ static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
 struct gpu_group_data {
 	struct ppcg_scop *scop;
 	int kernel_depth;
+	int shared_depth;
 	int thread_depth;
 	int n_thread;
 	isl_set *privatization;
 	isl_union_map *host_sched;
 	isl_union_map *shared_sched;
+	isl_union_map *copy_sched;
 	isl_union_map *thread_sched;
 	isl_union_map *full_sched;
 };
@@ -813,7 +821,7 @@ static int populate_array_references(struct gpu_local_array_info *local,
 {
 	int i;
 	int n;
-	isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched);
+	isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched);
 
 	n = 0;
 	for (i = 0; i < local->array->n_ref; ++i) {
@@ -825,7 +833,7 @@ static int populate_array_references(struct gpu_local_array_info *local,
 		map = isl_map_copy(access->access);
 		umap = isl_union_map_from_map(map);
 		umap = isl_union_map_apply_domain(umap,
-				isl_union_map_copy(data->shared_sched));
+				isl_union_map_copy(data->copy_sched));
 
 		if (isl_union_map_is_empty(umap)) {
 			isl_union_map_free(umap);
@@ -874,7 +882,7 @@ struct gpu_array_ref_group *gpu_array_ref_group_free(
 }
 
 /* Check if the access relations of group1 and group2 overlap within
- * shared_sched.
+ * copy_sched.
  */
 static int accesses_overlap(struct gpu_array_ref_group *group1,
 	struct gpu_array_ref_group *group2)
@@ -993,6 +1001,24 @@ static int check_requires_unroll(struct gpu_group_data *data,
 	return !bijective;
 }
 
+/* Map the domain of "access" to the outer data->shared_depth
+ * schedule dimensions. When data->shared_depth is equal to
+ * data->thread_depth, this result is already available in group->access.
+ */
+static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group,
+	__isl_keep isl_union_map *access, struct gpu_group_data *data)
+{
+	isl_union_map *shared;
+
+	if (data->shared_depth == data->thread_depth)
+		return isl_map_copy(group->access);
+
+	shared = isl_union_map_copy(access);
+	shared = isl_union_map_apply_domain(shared,
+			isl_union_map_copy(data->shared_sched));
+	return isl_map_from_union_map(shared);
+}
+
 /* Compute the private and/or shared memory tiles for the array
  * reference group "group" of array "array".
  * Return 0 on success and -1 on error.
@@ -1104,11 +1130,13 @@ static int compute_group_bounds_core(struct ppcg_kernel *kernel,
 	if (use_shared && (!no_reuse || !coalesced)) {
 		group->shared_tile = gpu_array_tile_create(ctx,
							group->array->n_index);
+		acc = shared_access(group, access, data);
 		if (!group->shared_tile)
 			r = -1;
-		else if (!can_tile(group->access, group->shared_tile))
+		else if (!can_tile(acc, group->shared_tile))
 			group->shared_tile =
					gpu_array_tile_free(group->shared_tile);
+		isl_map_free(acc);
 	}
 
 	if (r < 0 || (!force_private && (!use_private || no_reuse))) {
@@ -1599,6 +1627,7 @@ int gpu_group_references(struct ppcg_kernel *kernel,
 
 	node = isl_schedule_node_copy(node);
 	node = gpu_tree_move_down_to_thread(node, kernel->core);
+	data.shared_depth = isl_schedule_node_get_schedule_depth(node);
 	data.shared_sched =
 		isl_schedule_node_get_prefix_schedule_relation(node);
 	data.shared_sched = isl_union_map_detect_equalities(data.shared_sched);
@@ -1606,7 +1635,8 @@ int gpu_group_references(struct ppcg_kernel *kernel,
 	node = isl_schedule_node_child(node, 0);
 	data.thread_depth = isl_schedule_node_get_schedule_depth(node);
 	data.n_thread = isl_schedule_node_band_n_member(node);
-	data.thread_sched = isl_union_map_copy(data.shared_sched);
+	data.copy_sched = isl_union_map_copy(data.shared_sched);
+	data.thread_sched = isl_union_map_copy(data.copy_sched);
 	data.thread_sched = isl_union_map_flat_range_product(data.thread_sched,
 		isl_schedule_node_band_get_partial_schedule_union_map(node));
 	data.thread_sched = isl_union_map_detect_equalities(data.thread_sched);
@@ -1614,6 +1644,8 @@ int gpu_group_references(struct ppcg_kernel *kernel,
 	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
 	data.host_sched = expand(data.host_sched, contraction);
 	data.shared_sched = expand(data.shared_sched, contraction);
+	isl_union_map_free(data.copy_sched);
+	data.copy_sched = isl_union_map_copy(data.shared_sched);
 	data.thread_sched = expand(data.thread_sched, contraction);
 	isl_union_pw_multi_aff_free(contraction);
 
@@ -1633,6 +1665,7 @@ int gpu_group_references(struct ppcg_kernel *kernel,
 
 	isl_union_map_free(data.host_sched);
 	isl_union_map_free(data.shared_sched);
+	isl_union_map_free(data.copy_sched);
 	isl_union_map_free(data.thread_sched);
 	isl_union_map_free(data.full_sched);
 	isl_set_free(data.privatization);
diff --git a/gpu_group.h b/gpu_group.h
index 9a3e970..c94812d 100644
--- a/gpu_group.h
+++ b/gpu_group.h
@@ -20,7 +20,7 @@ struct gpu_array_ref_group {
 	int nr;
 
 	/* The following fields are use during the construction of the groups.
-	 * access is the combined access relation relative to the shared
+	 * access is the combined access relation relative to the private
 	 * memory tiling. In particular, the domain of the map corresponds
 	 * to the first thread_depth dimensions of the kernel schedule.
 	 * write is set if any access in the group is a write.
-- 
2.11.4.GIT
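
Note (illustration only, not part of the patch): the composition performed by the new shared_access() helper can be tried out in isolation with a small standalone isl program. The statement S, array A and the toy relations below are made up for the example; only the sequence of isl calls mirrors the helper.

#include <isl/ctx.h>
#include <isl/union_map.h>
#include <isl/map.h>

/* Sketch of the mapping performed by shared_access(): express an
 * access relation in terms of an outer prefix schedule.  The relations
 * are invented examples, not taken from any PPCG input.
 */
int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	/* Access S[i,j] -> A[i] and a prefix schedule S[i,j] -> [i]
	 * playing the role of data->shared_sched. */
	isl_union_map *access = isl_union_map_read_from_str(ctx,
		"{ S[i, j] -> A[i] : 0 <= i < 8 and 0 <= j < 8 }");
	isl_union_map *shared_sched = isl_union_map_read_from_str(ctx,
		"{ S[i, j] -> [i] }");
	isl_union_map *shared;
	isl_map *acc;

	/* Same composition as in shared_access(): pull the access domain
	 * to the outer schedule dimensions. */
	shared = isl_union_map_apply_domain(access, shared_sched);
	acc = isl_map_from_union_map(shared);

	/* Prints something like { [i] -> A[i] : 0 <= i <= 7 }. */
	isl_map_dump(acc);

	isl_map_free(acc);
	isl_ctx_free(ctx);
	return 0;
}

As long as shared_depth equals thread_depth, the helper skips this composition and returns a copy of group->access, which is already expressed relative to the first thread_depth dimensions of the kernel schedule.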