From a6c2e58e2a324c4faace7319d03d7aedb3d0fca4 Mon Sep 17 00:00:00 2001
From: Sven Verdoolaege <skimo@kotnet.org>
Date: Sat, 4 Feb 2012 15:19:56 +0100
Subject: [PATCH] restrict grid to actually executed blocks

The grid sizes used for wrapping iterations over the blocks
in the grid (either the default sizes or those specified by
the user) may be larger than the actual number of blocks
required to run the kernel, possibly depending on the parameter
values.  With this commit, we now construct a box containing
the origin and the ids of the blocks that actually need
to do something.  There may still be blocks in there that
have no work, but there should be fewer of them than before.

Signed-off-by: Sven Verdoolaege <skimo@kotnet.org>
---
 cuda.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 86 insertions(+), 5 deletions(-)

diff --git a/cuda.c b/cuda.c
index c5dfcdd..c2beecc 100644
--- a/cuda.c
+++ b/cuda.c
@@ -3656,6 +3656,87 @@ static void set_tile_len(struct cuda_gen *gen, struct clast_user_stmt *u)
 	gen->n_parallel = stmt->n_parallel;
 }
 
+/* Extract a description of the grid, i.e., the possible values
+ * of the block ids, from gen->tiled_sched.
+ * The block ids are parameters in gen->tiled_sched.
+ * We simply need to change them into set dimensions.
+ */
+static __isl_give isl_set *extract_grid(struct cuda_gen *gen)
+{
+	int i;
+	isl_set *grid;
+
+	grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
+	grid = isl_set_from_params(grid);
+	grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
+	for (i = 0; i < gen->n_grid; ++i) {
+		int pos;
+		char name[20];
+
+		snprintf(name, sizeof(name), "b%d", i);
+		pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
+		assert(pos >= 0);
+		grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
+		grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
+	}
+
+	return grid;
+}
+
+/* Print the effective grid size as a list of the sizes in each
+ * dimension, from innermost to outermost.
+ *
+ * The grid size specified by the user or set by default
+ * in read_grid_sizes() and applied in tile_schedule(),
+ * may be too large for the given code in the sense that
+ * it may contain blocks that don't need to execute anything.
+ * We therefore don't print this grid size, but instead the
+ * smallest grid size that ensures that all blocks that actually
+ * execute code are included in the grid.
+ *
+ * For each block dimension, we compute the maximal value of the block id
+ * and add one.
+ */
+static void print_grid_size(struct cuda_gen *gen, __isl_take isl_set *context)
+{
+	int i;
+	isl_printer *prn;
+	isl_set *grid;
+
+	if (gen->grid_dim == 0)
+		return;
+
+	grid = extract_grid(gen);
+
+	prn = isl_printer_to_file(gen->ctx, gen->cuda.host_c);
+	prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
+
+	prn = isl_printer_print_str(prn, "(");
+	for (i = gen->n_grid - 1; i >= 0; --i) {
+		isl_space *space;
+		isl_aff *one;
+		isl_pw_aff *bound = isl_set_dim_max(isl_set_copy(grid), i);
+
+		bound = isl_pw_aff_coalesce(bound);
+		bound = isl_pw_aff_gist(bound, isl_set_copy(context));
+
+		space = isl_pw_aff_get_domain_space(bound);
+		one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
+		one = isl_aff_add_constant_si(one, 1);
+		bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
+		prn = isl_printer_print_pw_aff(prn, bound);
+		isl_pw_aff_free(bound);
+
+		if (i > 0)
+			prn = isl_printer_print_str(prn, ", ");
+	}
+	prn = isl_printer_print_str(prn, ")");
+
+	isl_printer_free(prn);
+	isl_set_free(grid);
+	isl_set_free(context);
+}
+
 /* This function is called for each leaf in the clast of the host code.
  * We first specialize the schedule to the site of the leaf, compute
  * the size of shared memory and then print the body of host code
@@ -3692,15 +3773,15 @@ static void print_host_user(struct gpucode_info *code,
 	print_reverse_list(code->dst, gen->n_block, gen->block_dim);
 	fprintf(code->dst, ";\n");
 
-	print_indent(code->dst, code->indent);
-	fprintf(code->dst, "dim3 k%d_dimGrid", gen->kernel_id);
-	print_reverse_list(code->dst, gen->n_grid, gen->grid_dim);
-	fprintf(code->dst, ";\n");
-
 	gen->tiled_sched = tile_schedule(gen, local_sched);
 	gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
 	gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
 
+	print_indent(code->dst, code->indent);
+	fprintf(code->dst, "dim3 k%d_dimGrid", gen->kernel_id);
+	print_grid_size(gen, isl_set_params(isl_set_copy(host_domain)));
+	fprintf(code->dst, ";\n");
+
 	gen->local_sched = isl_union_map_copy(gen->tiled_sched);
 
 	dim = isl_union_map_get_space(gen->local_sched);
-- 
2.11.4.GIT