From cf470867baedce340eff08a80afb6e9b3018d7ea Mon Sep 17 00:00:00 2001
From: Sven Verdoolaege
Date: Wed, 2 Oct 2013 21:54:23 +0200
Subject: [PATCH] gpu_group_references: distinguish between shared_schedule and copy_schedule

The initial mapping to shared memory is currently computed
right outside the band that is mapped to threads.
In a subsequent commit, it will be made possible to introduce
this mapping at a higher level.
In preparation, adjust gpu_group_references to take into account
this distinction by introducing a copy_schedule that is used
for determining the initial overlap between accesses.
In particular, when a reference group can only be mapped to
private memory, overlap will still be determined at the deepest level.

Signed-off-by: Sven Verdoolaege
---
 gpu_group.c | 47 ++++++++++++++++++++++++++++++++++++++++-------
 gpu_group.h |  2 +-
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/gpu_group.c b/gpu_group.c
index efbb916..199e99f 100644
--- a/gpu_group.c
+++ b/gpu_group.c
@@ -439,18 +439,24 @@ static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
  * kernel_depth is the schedule depth where the kernel launch will
  * be introduced, i.e., it is the depth of the band that is mapped
  * to blocks.
+ * shared_depth is the schedule depth at which the copying to/from
+ * shared memory is computed. The copy operation may then
+ * later be hoisted to a higher level.
  * thread_depth is the schedule depth where the thread mark is located,
  * i.e., it is the depth of the band that is mapped to threads and also
- * the schedule depth at which the copying to/from shared/private memory
+ * the schedule depth at which the copying to/from private memory
  * is computed. The copy operation may then later be hoisted to
  * a higher level.
+ * Currently, shared_depth is equal to thread_depth.
  * n_thread is the number of schedule dimensions in the band that
  * is mapped to threads.
  * privatization lives in the range of thread_sched (i.e., it is
  * of dimension thread_depth + n_thread) and encodes the mapping
  * to thread identifiers (as parameters).
  * host_sched contains the kernel_depth dimensions of the host schedule.
- * shared_sched contains the first thread_depth dimensions of the
+ * shared_sched contains the first shared_depth dimensions of the
+ * kernel schedule.
+ * copy_sched contains the first thread_depth dimensions of the
  * kernel schedule.
  * thread_sched contains the first (thread_depth + n_thread) dimensions
  * of the kernel schedule.
@@ -462,11 +468,13 @@ static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
 struct gpu_group_data {
 	struct ppcg_scop *scop;
 	int kernel_depth;
+	int shared_depth;
 	int thread_depth;
 	int n_thread;
 	isl_set *privatization;
 	isl_union_map *host_sched;
 	isl_union_map *shared_sched;
+	isl_union_map *copy_sched;
 	isl_union_map *thread_sched;
 	isl_union_map *full_sched;
 };
@@ -813,7 +821,7 @@ static int populate_array_references(struct gpu_local_array_info *local,
 {
 	int i;
 	int n;
-	isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched);
+	isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched);
 
 	n = 0;
 	for (i = 0; i < local->array->n_ref; ++i) {
@@ -825,7 +833,7 @@ static int populate_array_references(struct gpu_local_array_info *local,
 		map = isl_map_copy(access->access);
 		umap = isl_union_map_from_map(map);
 		umap = isl_union_map_apply_domain(umap,
-				isl_union_map_copy(data->shared_sched));
+				isl_union_map_copy(data->copy_sched));
 
 		if (isl_union_map_is_empty(umap)) {
 			isl_union_map_free(umap);
@@ -874,7 +882,7 @@ struct gpu_array_ref_group *gpu_array_ref_group_free(
 }
 
 /* Check if the access relations of group1 and group2 overlap within
- * shared_sched.
+ * copy_sched.
  */
 static int accesses_overlap(struct gpu_array_ref_group *group1,
 	struct gpu_array_ref_group *group2)
@@ -993,6 +1001,24 @@ static int check_requires_unroll(struct gpu_group_data *data,
 	return !bijective;
 }
 
+/* Map the domain of "access" to the outer data->shared_depth
+ * schedule dimensions. When data->shared_depth is equal to
+ * data->thread_depth, this result is already available in group->access.
+ */
+static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group,
+	__isl_keep isl_union_map *access, struct gpu_group_data *data)
+{
+	isl_union_map *shared;
+
+	if (data->shared_depth == data->thread_depth)
+		return isl_map_copy(group->access);
+
+	shared = isl_union_map_copy(access);
+	shared = isl_union_map_apply_domain(shared,
+			isl_union_map_copy(data->shared_sched));
+	return isl_map_from_union_map(shared);
+}
+
 /* Compute the private and/or shared memory tiles for the array
  * reference group "group" of array "array".
  * Return 0 on success and -1 on error.
@@ -1104,11 +1130,13 @@ static int compute_group_bounds_core(struct ppcg_kernel *kernel,
 	if (use_shared && (!no_reuse || !coalesced)) {
 		group->shared_tile = gpu_array_tile_create(ctx,
							group->array->n_index);
+		acc = shared_access(group, access, data);
 		if (!group->shared_tile)
 			r = -1;
-		else if (!can_tile(group->access, group->shared_tile))
+		else if (!can_tile(acc, group->shared_tile))
 			group->shared_tile =
					gpu_array_tile_free(group->shared_tile);
+		isl_map_free(acc);
 	}
 
 	if (r < 0 || (!force_private && (!use_private || no_reuse))) {
@@ -1599,6 +1627,7 @@ int gpu_group_references(struct ppcg_kernel *kernel,
 
 	node = isl_schedule_node_copy(node);
 	node = gpu_tree_move_down_to_thread(node, kernel->core);
+	data.shared_depth = isl_schedule_node_get_schedule_depth(node);
 	data.shared_sched =
 		isl_schedule_node_get_prefix_schedule_relation(node);
 	data.shared_sched = isl_union_map_detect_equalities(data.shared_sched);
@@ -1606,7 +1635,8 @@ int gpu_group_references(struct ppcg_kernel *kernel,
 	node = isl_schedule_node_child(node, 0);
 	data.thread_depth = isl_schedule_node_get_schedule_depth(node);
 	data.n_thread = isl_schedule_node_band_n_member(node);
-	data.thread_sched = isl_union_map_copy(data.shared_sched);
+	data.copy_sched = isl_union_map_copy(data.shared_sched);
+	data.thread_sched = isl_union_map_copy(data.copy_sched);
 	data.thread_sched = isl_union_map_flat_range_product(data.thread_sched,
 		isl_schedule_node_band_get_partial_schedule_union_map(node));
 	data.thread_sched = isl_union_map_detect_equalities(data.thread_sched);
@@ -1614,6 +1644,8 @@ int gpu_group_references(struct ppcg_kernel *kernel,
 	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
 	data.host_sched = expand(data.host_sched, contraction);
 	data.shared_sched = expand(data.shared_sched, contraction);
+	isl_union_map_free(data.copy_sched);
+	data.copy_sched = isl_union_map_copy(data.shared_sched);
 	data.thread_sched = expand(data.thread_sched, contraction);
 	isl_union_pw_multi_aff_free(contraction);
 
@@ -1633,6 +1665,7 @@ int gpu_group_references(struct ppcg_kernel *kernel,
 
 	isl_union_map_free(data.host_sched);
 	isl_union_map_free(data.shared_sched);
+	isl_union_map_free(data.copy_sched);
 	isl_union_map_free(data.thread_sched);
 	isl_union_map_free(data.full_sched);
 	isl_set_free(data.privatization);
diff --git a/gpu_group.h b/gpu_group.h
index 9a3e970..c94812d 100644
--- a/gpu_group.h
+++ b/gpu_group.h
@@ -20,7 +20,7 @@ struct gpu_array_ref_group {
 	int nr;
 
 	/* The following fields are use during the construction of the groups.
-	 * access is the combined access relation relative to the shared
+	 * access is the combined access relation relative to the private
 	 * memory tiling. In particular, the domain of the map corresponds
 	 * to the first thread_depth dimensions of the kernel schedule.
 	 * write is set if any access in the group is a write.
-- 
2.11.4.GIT
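
Note (illustration only, not part of the patch): the composition performed by the new shared_access() helper can be tried out in isolation with a small standalone isl program. The statement S, array A and the toy relations below are made up for the example; only the sequence of isl calls mirrors the helper.

#include <isl/ctx.h>
#include <isl/union_map.h>
#include <isl/map.h>

/* Sketch of the mapping performed by shared_access(): express an
 * access relation in terms of an outer prefix schedule.  The relations
 * are invented examples, not taken from any PPCG input.
 */
int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	/* Access S[i,j] -> A[i] and a prefix schedule S[i,j] -> [i]
	 * playing the role of data->shared_sched. */
	isl_union_map *access = isl_union_map_read_from_str(ctx,
		"{ S[i, j] -> A[i] : 0 <= i < 8 and 0 <= j < 8 }");
	isl_union_map *shared_sched = isl_union_map_read_from_str(ctx,
		"{ S[i, j] -> [i] }");
	isl_union_map *shared;
	isl_map *acc;

	/* Same composition as in shared_access(): pull the access domain
	 * to the outer schedule dimensions. */
	shared = isl_union_map_apply_domain(access, shared_sched);
	acc = isl_map_from_union_map(shared);

	/* Prints something like { [i] -> A[i] : 0 <= i <= 7 }. */
	isl_map_dump(acc);

	isl_map_free(acc);
	isl_ctx_free(ctx);
	return 0;
}

As long as shared_depth equals thread_depth, the helper skips this composition and returns a copy of group->access, which is already expressed relative to the first thread_depth dimensions of the kernel schedule.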