/*
 * Copyright 2010-2011 INRIA Saclay
 *
 * Use of this software is governed by the GNU LGPLv2.1 license
 *
 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
 * 91893 Orsay, France
 */

#include <assert.h>
#include <stdlib.h>

#include <isl/polynomial.h>
#include <isl/union_set.h>
#include <isl/aff.h>
#include <isl/ilp.h>
#include <isl/flow.h>
#include <isl/band.h>
#include <isl/schedule.h>
#include <isl/options.h>
#include <cloog/isl/cloog.h>

#include "cuda.h"
#include "cuda_common.h"
#include "clast_printer.h"
#include "schedule.h"
#include "pet_printer.h"
#include "ppcg_options.h"
/* The fields stride, shift and shift_map only contain valid information
 * if shift != NULL.
 * If so, they express that current index is such that if you add shift,
 * then the result is always a multiple of stride.
 * shift_map contains the mapping
 *
 *	i -> (i + shift)/stride
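 *
 * For example (illustrative values, not taken from any particular input):
 * with stride 3 and shift 2, the indices 1, 4 and 7 are mapped to 1, 2 and 3.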
 */
struct cuda_array_bound {
	isl_int size;
	isl_aff *lb;

	isl_int stride;
	isl_aff *shift;
	isl_basic_map *shift_map;
};

struct cuda_array_info;
/* A group of array references in a kernel that should be handled together.
 * If private_bound is not NULL, then it is mapped to registers.
 * Otherwise, if shared_bound is not NULL, it is mapped to shared memory.
 * Otherwise, it is accessed from global memory.
 */
struct cuda_array_ref_group {
	/* The references in this group access this array. */
	struct cuda_array_info *array;
	/* Position of this group in the list of reference groups of array. */
	int nr;

	/* The following fields are used during the construction of the groups.
	 * access is the combined access relation relative to the shared
	 * memory tiling.
	 * write is set if any access in the group is a write.
	 */
	isl_map *access;
	int write;

	/* For each index, size and offset of piece in shared memory. */
	struct cuda_array_bound *shared_bound;

	/* For each index, size and offset of piece in private memory. */
	struct cuda_array_bound *private_bound;

	/* References in this group; point to elements of a linked list. */
	int n_ref;
	struct cuda_stmt_access **refs;

	/* Last shared memory tile dimension that affects tile of this group. */
	int last_shared;
	/* Dimension at which copying to/from shared memory is printed.
	 * if >= 0, then the value is >= last_shared
	 * if -1, then the copying is done at the leaf level.
	 */
	int print_shared_level;
};
struct cuda_array_info {
	isl_space *dim;
	/* Element type. */
	char *type;
	/* Element size. */
	int size;
	/* Name of the array. */
	char *name;
	/* Number of indices. */
	unsigned n_index;
	/* For each index, a bound on the array in that direction. */
	isl_pw_aff **bound;
	/* For each index, bound[i] specialized to the current kernel. */
	isl_pw_aff **local_bound;

	/* All references to this array; point to elements of a linked list. */
	int n_ref;
	struct cuda_stmt_access **refs;

	/* The reference groups associated to this array. */
	int n_group;
	struct cuda_array_ref_group **groups;

	/* For scalars, is this scalar read-only within the entire program? */
	int read_only;
};
/* Print the name of the local copy of a given group of array references.
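 *
 * As an illustration (hypothetical array name): for the group with
 * sequence number 1 of an array "A" that has more than one group and
 * that is mapped to shared memory, this prints "shared_A_1";
 * for an array accessed directly from global memory it prints just "A".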
 */
static void print_array_name(FILE *out, struct cuda_array_ref_group *group)
{
	int global = 0;

	if (group->private_bound)
		fprintf(out, "private_");
	else if (group->shared_bound)
		fprintf(out, "shared_");
	else
		global = 1;
	fprintf(out, "%s", group->array->name);
	if (!global && group->array->n_group > 1)
		fprintf(out, "_%d", group->nr);
}
/* Collect all references to the given array and store pointers to them
 * in array->refs.
 */
static void collect_references(struct cuda_gen *gen,
	struct cuda_array_info *array)
{
	int i;
	int n;

	n = 0;
	for (i = 0; i < gen->n_stmts; ++i) {
		struct cuda_stmt *stmt = &gen->stmts[i];
		struct cuda_stmt_access *access;

		for (access = stmt->accesses; access; access = access->next) {
			const char *name;
			name = isl_map_get_tuple_name(access->access,
				isl_dim_out);
			if (name && !strcmp(array->name, name))
				n++;
		}
	}

	array->n_ref = n;
	array->refs = isl_alloc_array(gen->ctx, struct cuda_stmt_access *, n);
	assert(array->refs);

	n = 0;
	for (i = 0; i < gen->n_stmts; ++i) {
		struct cuda_stmt *stmt = &gen->stmts[i];
		struct cuda_stmt_access *access;

		for (access = stmt->accesses; access; access = access->next) {
			const char *name;
			name = isl_map_get_tuple_name(access->access,
				isl_dim_out);
			if (!name || strcmp(array->name, name))
				continue;

			array->refs[n++] = access;
		}
	}
}
static struct cuda_array_bound *create_bound_list(isl_ctx *ctx, int n_index)
{
	int i;
	struct cuda_array_bound *bound;

	bound = isl_alloc_array(ctx, struct cuda_array_bound, n_index);
	assert(bound);

	for (i = 0; i < n_index; ++i) {
		isl_int_init(bound[i].size);
		bound[i].lb = NULL;
		isl_int_init(bound[i].stride);
		bound[i].shift = NULL;
		bound[i].shift_map = NULL;
	}

	return bound;
}
static void free_bound_list(struct cuda_array_bound *bound, int n_index)
{
	int j;

	if (!bound)
		return;

	for (j = 0; j < n_index; ++j) {
		isl_int_clear(bound[j].size);
		isl_int_clear(bound[j].stride);
		isl_aff_free(bound[j].lb);
		isl_aff_free(bound[j].shift);
		isl_basic_map_free(bound[j].shift_map);
	}
	free(bound);
}
static struct pet_array *find_array(struct pet_scop *scop,
	__isl_keep isl_set *accessed)
{
	int i;
	isl_id *id;

	id = isl_set_get_tuple_id(accessed);

	for (i = 0; i < scop->n_array; ++i) {
		isl_id *id_i;

		id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
		isl_id_free(id_i);
		if (id == id_i)
			break;
	}
	isl_id_free(id);

	return i < scop->n_array ? scop->arrays[i] : NULL;
}
/* Compute bounds on the host arrays based on the accessed elements
 * and collect all references to the array.
 *
 * If the array is zero-dimensional, i.e., a scalar, we check
 * whether it is read-only.
 */
static int extract_array_info(__isl_take isl_set *array, void *user)
{
	int i;
	struct cuda_gen *gen = (struct cuda_gen *)user;
	const char *name;
	int n_index;
	isl_pw_aff **bounds;
	isl_pw_aff **local_bounds;
	struct pet_array *pa;

	n_index = isl_set_dim(array, isl_dim_set);
	name = isl_set_get_tuple_name(array);
	bounds = isl_alloc_array(isl_set_get_ctx(array),
		isl_pw_aff *, n_index);
	assert(bounds);
	local_bounds = isl_calloc_array(isl_set_get_ctx(array),
		isl_pw_aff *, n_index);
	assert(local_bounds);
	gen->array[gen->n_array].dim = isl_set_get_space(array);
	gen->array[gen->n_array].name = strdup(name);
	gen->array[gen->n_array].n_index = n_index;
	gen->array[gen->n_array].bound = bounds;
	gen->array[gen->n_array].local_bound = local_bounds;

	pa = find_array(gen->scop, array);
	assert(pa);

	gen->array[gen->n_array].type = strdup(pa->element_type);
	gen->array[gen->n_array].size = pa->element_size;

	if (n_index == 0) {
		isl_set *space;
		isl_union_map *write;
		int empty;

		write = isl_union_map_copy(gen->write);
		space = isl_set_universe(isl_set_get_space(array));
		write = isl_union_map_intersect_range(write,
			isl_union_set_from_set(space));
		empty = isl_union_map_is_empty(write);
		isl_union_map_free(write);

		gen->array[gen->n_array].read_only = empty;
	}

	for (i = 0; i < n_index; ++i) {
		isl_set *dom;
		isl_local_space *ls;
		isl_aff *one;
		isl_pw_aff *bound;
		isl_set *size = i == 0 ? array : pa->extent;

		bound = isl_set_dim_max(isl_set_copy(size), i);
		assert(bound);
		dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
		ls = isl_local_space_from_space(isl_set_get_space(dom));
		one = isl_aff_zero_on_domain(ls);
		one = isl_aff_add_constant_si(one, 1);
		bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
		bound = isl_pw_aff_gist(bound, isl_set_copy(gen->context));

		bounds[i] = bound;
	}

	collect_references(gen, &gen->array[gen->n_array]);

	gen->n_array++;

	isl_set_free(array);
	return 0;
}
void collect_array_info(struct cuda_gen *gen)
{
	isl_union_set *arrays;

	arrays = isl_union_map_range(isl_union_map_copy(gen->read));
	arrays = isl_union_set_union(arrays,
		isl_union_map_range(isl_union_map_copy(gen->write)));
	arrays = isl_union_set_coalesce(arrays);

	gen->n_array = isl_union_set_n_set(arrays);
	gen->array = isl_alloc_array(gen->ctx,
		struct cuda_array_info, gen->n_array);
	assert(gen->array);
	gen->n_array = 0;
	isl_union_set_foreach_set(arrays, &extract_array_info, gen);
	isl_union_set_free(arrays);
}
static void free_array_info(struct cuda_gen *gen)
{
	int i, j;

	for (i = 0; i < gen->n_array; ++i) {
		int n_index = gen->array[i].n_index;
		free(gen->array[i].type);
		free(gen->array[i].name);
		for (j = 0; j < n_index; ++j) {
			isl_pw_aff_free(gen->array[i].bound[j]);
			isl_pw_aff_free(gen->array[i].local_bound[j]);
		}
		isl_space_free(gen->array[i].dim);
		free(gen->array[i].bound);
		free(gen->array[i].local_bound);
		free(gen->array[i].refs);
	}
	free(gen->array);
}
/* Check if a cuda array is a scalar.  A scalar is a value that is not stored
 * as an array or through a pointer reference, but as a single data element.
 * At the moment, scalars are represented as zero dimensional arrays.
 */
static int cuda_array_is_scalar(struct cuda_array_info *array)
{
	return (array->n_index == 0);
}
/* Is "array" a read-only scalar?
 */
static int cuda_array_is_read_only_scalar(struct cuda_array_info *array)
{
	return cuda_array_is_scalar(array) && array->read_only;
}
static void declare_device_arrays(struct cuda_gen *gen)
{
	int i;

	for (i = 0; i < gen->n_array; ++i) {
		if (cuda_array_is_read_only_scalar(&gen->array[i]))
			continue;
		fprintf(gen->cuda.host_c, "%s *dev_%s;\n",
			gen->array[i].type, gen->array[i].name);
	}
	fprintf(gen->cuda.host_c, "\n");
}
static void print_array_size(struct cuda_gen *gen, FILE *out,
	struct cuda_array_info *array)
{
	int i;
	isl_printer *prn;

	prn = isl_printer_to_file(gen->ctx, out);
	prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
	for (i = 0; i < array->n_index; ++i) {
		prn = isl_printer_print_str(prn, "(");
		prn = isl_printer_print_pw_aff(prn, array->bound[i]);
		prn = isl_printer_print_str(prn, ") * ");
	}
	prn = isl_printer_print_str(prn, "sizeof(");
	prn = isl_printer_print_str(prn, array->type);
	prn = isl_printer_print_str(prn, ")");
	isl_printer_free(prn);
}
static void allocate_device_arrays(struct cuda_gen *gen)
{
	int i;

	for (i = 0; i < gen->n_array; ++i) {
		if (cuda_array_is_read_only_scalar(&gen->array[i]))
			continue;
		fprintf(gen->cuda.host_c,
			"cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
			gen->array[i].name);
		print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
		fprintf(gen->cuda.host_c, "));\n");
	}
	fprintf(gen->cuda.host_c, "\n");
}
static void free_device_arrays(struct cuda_gen *gen)
{
	int i;

	for (i = 0; i < gen->n_array; ++i) {
		if (cuda_array_is_read_only_scalar(&gen->array[i]))
			continue;
		fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaFree(dev_%s));\n",
			gen->array[i].name);
	}
}
static void copy_arrays_to_device(struct cuda_gen *gen)
{
	int i;

	for (i = 0; i < gen->n_array; ++i) {
		isl_space *dim;
		isl_set *read_i;
		int empty;

		if (cuda_array_is_read_only_scalar(&gen->array[i]))
			continue;

		dim = isl_space_copy(gen->array[i].dim);
		read_i = isl_union_set_extract_set(gen->copy_in, dim);
		empty = isl_set_fast_is_empty(read_i);
		isl_set_free(read_i);
		if (empty)
			continue;

		fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(dev_%s,",
			gen->array[i].name);

		if (cuda_array_is_scalar(&(gen->array[i])))
			fprintf(gen->cuda.host_c, " &%s, ",
				gen->array[i].name);
		else
			fprintf(gen->cuda.host_c, " %s, ", gen->array[i].name);

		print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
		fprintf(gen->cuda.host_c, ", cudaMemcpyHostToDevice));\n");
	}
	fprintf(gen->cuda.host_c, "\n");
}
static void copy_arrays_from_device(struct cuda_gen *gen)
{
	int i;
	isl_union_set *write;
	write = isl_union_map_range(isl_union_map_copy(gen->write));

	for (i = 0; i < gen->n_array; ++i) {
		isl_space *dim;
		isl_set *write_i;
		int empty;

		dim = isl_space_copy(gen->array[i].dim);
		write_i = isl_union_set_extract_set(write, dim);
		empty = isl_set_fast_is_empty(write_i);
		isl_set_free(write_i);
		if (empty)
			continue;

		fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(");
		if (cuda_array_is_scalar(&gen->array[i]))
			fprintf(gen->cuda.host_c, "&%s, ", gen->array[i].name);
		else
			fprintf(gen->cuda.host_c, "%s, ", gen->array[i].name);
		fprintf(gen->cuda.host_c, "dev_%s, ", gen->array[i].name);
		print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
		fprintf(gen->cuda.host_c, ", cudaMemcpyDeviceToHost));\n");
	}

	isl_union_set_free(write);
	fprintf(gen->cuda.host_c, "\n");
}
static void read_sizes_from_file(struct cuda_gen *gen, const char *filename,
	int *sizes, int len)
{
	int i;
	FILE *file;

	file = fopen(filename, "r");
	if (!file)
		return;

	for (i = 0; i < len; ++i)
		if (fscanf(file, "%d", &sizes[i]) < 1)
			break;

	fclose(file);
}
/* Internal data structure for extract_size_of_type.
 * "type" specifies the name of the space that we want to extract.
 * "res" is used to store the subset of that space.
 */
struct ppcg_extract_size_data {
	const char *type;
	isl_set *res;
};
/* This function is called for each set in a union_set.
 * If the name of the set matches data->type, we store the
 * set in data->res.
 */
static int extract_size_of_type(__isl_take isl_set *size, void *user)
{
	struct ppcg_extract_size_data *data = user;
	const char *name;

	name = isl_set_get_tuple_name(size);
	if (name && !strcmp(name, data->type)) {
		data->res = size;
		return -1;
	}

	isl_set_free(size);
	return 0;
}
/* Given a union map { kernel[i] -> *[...] },
 * return the range in the space called "type" for the kernel with
 * sequence number "id".
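 *
 * A sketch of what this looks like (hypothetical sizes specification):
 * given the union map { kernel[0] -> block[32, 16] }, type "block" and
 * id 0, the returned set is { block[32, 16] }.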
 */
static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
	const char *type, int id)
{
	isl_space *space;
	isl_set *dom;
	isl_union_set *local_sizes;
	struct ppcg_extract_size_data data = { type, NULL };

	if (!sizes)
		return NULL;

	space = isl_union_map_get_space(sizes);
	space = isl_space_set_from_params(space);
	space = isl_space_add_dims(space, isl_dim_set, 1);
	space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
	dom = isl_set_universe(space);
	dom = isl_set_fix_si(dom, isl_dim_set, 0, id);

	local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
		isl_union_map_copy(sizes));
	isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
	isl_union_set_free(local_sizes);
	return data.res;
}
/* Given a singleton set, extract the first (at most *len) elements
 * of the single integer tuple into *sizes and update *len if needed.
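 *
 * For instance (illustrative values), the singleton set { block[32, 16] }
 * with *len initially 3 results in sizes[0] = 32, sizes[1] = 16 and *len = 2.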
 */
static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
{
	int i;
	int dim;
	isl_int v;

	if (!set)
		return;

	dim = isl_set_dim(set, isl_dim_set);
	if (dim < *len)
		*len = dim;

	isl_int_init(v);

	for (i = 0; i < *len; ++i) {
		int ok;

		ok = isl_set_plain_is_fixed(set, isl_dim_set, i, &v);
		assert(ok);

		sizes[i] = isl_int_get_si(v);
	}

	isl_int_clear(v);

	isl_set_free(set);
}
/* Extract user specified "tile" sizes from the "sizes" command line option,
 * defaulting to option->tile_size in each dimension.
 */
static void read_tile_sizes(struct cuda_gen *gen)
{
	int n;
	isl_set *size;

	gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
	assert(gen->tile_size);
	for (n = 0; n < gen->tile_len; ++n)
		gen->tile_size[n] = gen->options->tile_size;

	size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
	read_sizes_from_set(size, gen->tile_size, &gen->tile_len);

	if (gen->n_parallel > gen->tile_len)
		gen->n_parallel = gen->tile_len;
}
/* Extract user specified "block" sizes from the "sizes" command line option,
 * after filling in some potentially useful defaults.
 */
static void read_block_sizes(struct cuda_gen *gen)
{
	int n;
	isl_set *size;

	n = gen->n_parallel;
	gen->n_block = (n <= 3) ? n : 3;
	switch (gen->n_block) {
	case 1:
		gen->block_dim[0] = 512;
		break;
	case 2:
		gen->block_dim[0] = 32;
		gen->block_dim[1] = 16;
		break;
	default:
		gen->block_dim[0] = 32;
		gen->block_dim[1] = 4;
		gen->block_dim[2] = 4;
		break;
	}

	size = extract_sizes(gen->sizes, "block", gen->kernel_id);
	read_sizes_from_set(size, gen->block_dim, &gen->n_block);
}
/* Extract user specified "grid" sizes from the "sizes" command line option,
 * after filling in some potentially useful defaults.
 */
static void read_grid_sizes(struct cuda_gen *gen)
{
	int n = gen->n_parallel;
	isl_set *size;

	gen->n_grid = (n <= 2) ? n : 2;
	switch (gen->n_grid) {
	case 1:
		gen->grid_dim[0] = 32768;
		break;
	default:
		gen->grid_dim[0] = 256;
		gen->grid_dim[1] = 256;
		break;
	}

	size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
	read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
}
/* Extract user specified sizes from the "sizes" command line option
 * after filling in some potentially useful defaults.
 */
static void read_sizes(struct cuda_gen *gen)
{
	read_tile_sizes(gen);
	read_block_sizes(gen);
	read_grid_sizes(gen);
}
static void free_stmts(struct cuda_stmt *stmts, int n)
{
	int i;

	for (i = 0; i < n; ++i) {
		struct cuda_stmt_access *access, *next;

		for (access = stmts[i].accesses; access; access = next) {
			next = access->next;
			isl_map_free(access->access);
			free(access);
		}

		isl_set_free(stmts[i].domain);
	}
	free(stmts);
}
void clear_cuda_gen(struct cuda_gen *gen)
{
	free_stmts(gen->stmts, gen->n_stmts);
	free_array_info(gen);
	isl_union_map_free(gen->sizes);
	isl_set_free(gen->context);
	isl_union_set_free(gen->copy_in);
	isl_union_map_free(gen->sched);
	isl_union_map_free(gen->read);
	isl_union_map_free(gen->write);
}
static void print_reverse_list(FILE *out, int len, int *list)
{
	int i;

	if (len == 0)
		return;

	fprintf(out, "(");
	for (i = 0; i < len; ++i) {
		if (i)
			fprintf(out, ", ");
		fprintf(out, "%d", list[len - 1 - i]);
	}
	fprintf(out, ")");
}
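/* Print the kernel launch on the host and the corresponding kernel
 * declarations.  The arguments are the arrays accessed by the kernel,
 * the parameters and the host loop iterators h0, h1, ...
 *
 * As a rough illustration (array name, element type and parameter name
 * are hypothetical), for an array A of floats and a parameter n this
 * prints something like
 *
 *	kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_A, n);
 *
 * on the host side and
 *
 *	__global__ void kernel0(float *A, int n)
 *
 * in the kernel source and header files.
 */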
static void print_kernel_launch(struct cuda_gen *gen,
	__isl_keep isl_union_set *arrays)
{
	int i;
	int first = 1;
	unsigned nparam;
	isl_space *dim;

	print_indent(gen->code.dst, gen->code.indent);
	fprintf(gen->code.dst, "kernel%d <<<k%d_dimGrid, k%d_dimBlock>>> (",
		gen->kernel_id, gen->kernel_id, gen->kernel_id);
	fprintf(gen->cuda.kernel_c, "__global__ void kernel%d(",
		gen->kernel_id);
	fprintf(gen->cuda.kernel_h, "__global__ void kernel%d(",
		gen->kernel_id);

	for (i = 0; i < gen->n_array; ++i) {
		isl_space *dim;
		isl_set *arr;
		int empty;

		dim = isl_space_copy(gen->array[i].dim);
		arr = isl_union_set_extract_set(arrays, dim);
		empty = isl_set_fast_is_empty(arr);
		isl_set_free(arr);
		if (empty)
			continue;

		if (!first) {
			fprintf(gen->code.dst, ", ");
			fprintf(gen->cuda.kernel_c, ", ");
			fprintf(gen->cuda.kernel_h, ", ");
		}

		if (cuda_array_is_read_only_scalar(&gen->array[i])) {
			fprintf(gen->code.dst, "%s", gen->array[i].name);
			fprintf(gen->cuda.kernel_c, "%s %s",
				gen->array[i].type, gen->array[i].name);
			fprintf(gen->cuda.kernel_h, "%s %s",
				gen->array[i].type, gen->array[i].name);
		} else {
			fprintf(gen->code.dst, "dev_%s", gen->array[i].name);
			fprintf(gen->cuda.kernel_c, "%s *%s",
				gen->array[i].type, gen->array[i].name);
			fprintf(gen->cuda.kernel_h, "%s *%s",
				gen->array[i].type, gen->array[i].name);
		}

		first = 0;
	}

	dim = isl_union_set_get_space(arrays);
	nparam = isl_space_dim(dim, isl_dim_param);
	for (i = 0; i < nparam; ++i) {
		const char *name = isl_space_get_dim_name(dim, isl_dim_param, i);
		if (!first) {
			fprintf(gen->code.dst, ", ");
			fprintf(gen->cuda.kernel_c, ", ");
			fprintf(gen->cuda.kernel_h, ", ");
		}
		fprintf(gen->code.dst, "%s", name);
		fprintf(gen->cuda.kernel_c, "int %s", name);
		fprintf(gen->cuda.kernel_h, "int %s", name);
		first = 0;
	}
	isl_space_free(dim);

	for (i = 0; i < gen->tile_first; ++i) {
		if (!first) {
			fprintf(gen->code.dst, ", ");
			fprintf(gen->cuda.kernel_c, ", ");
			fprintf(gen->cuda.kernel_h, ", ");
		}
		fprintf(gen->code.dst, "h%d", i);
		fprintf(gen->cuda.kernel_c, "int h%d", i);
		fprintf(gen->cuda.kernel_h, "int h%d", i);
		first = 0;
	}

	fprintf(gen->code.dst, ");\n");
	fprintf(gen->cuda.kernel_c, ")\n");
	fprintf(gen->cuda.kernel_h, ");\n");

	fprintf(gen->code.dst, "cudaCheckKernel();\n");
}
/* Construct a map from a domain of dimensionality "len"
 * to a domain of dimensionality "len" + "tile_len" that tiles
 * the "tile_len" coordinates starting at "first".
 * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
 * "dim" prescribes the parameters.
 */
static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
	int first, int tile_len, int *tile_size)
{
	int i;
	isl_int v;
	isl_basic_map *bmap;
	isl_constraint *c;
	isl_local_space *ls;

	isl_int_init(v);

	dim = isl_space_add_dims(dim, isl_dim_in, len);
	dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
	bmap = isl_basic_map_universe(isl_space_copy(dim));
	ls = isl_local_space_from_space(dim);

	for (i = 0; i < len - tile_len; ++i) {
		int j = i < first ? i : i + tile_len;
		int k = i < first ? i : i + 2 * tile_len;

		c = isl_equality_alloc(isl_local_space_copy(ls));
		isl_int_set_si(v, -1);
		isl_constraint_set_coefficient(c, isl_dim_in, j, v);
		isl_int_set_si(v, 1);
		isl_constraint_set_coefficient(c, isl_dim_out, k, v);
		bmap = isl_basic_map_add_constraint(bmap, c);
	}

	for (i = 0; i < tile_len; ++i) {
		c = isl_equality_alloc(isl_local_space_copy(ls));
		isl_int_set_si(v, -1);
		isl_constraint_set_coefficient(c, isl_dim_in, first + i, v);
		isl_int_set_si(v, tile_size[i]);
		isl_constraint_set_coefficient(c, isl_dim_out, first + i, v);
		isl_int_set_si(v, 1);
		isl_constraint_set_coefficient(c, isl_dim_out,
				first + i + tile_len, v);
		bmap = isl_basic_map_add_constraint(bmap, c);

		c = isl_inequality_alloc(isl_local_space_copy(ls));
		isl_int_set_si(v, 1);
		isl_constraint_set_coefficient(c, isl_dim_out,
				first + i + tile_len, v);
		bmap = isl_basic_map_add_constraint(bmap, c);

		c = isl_inequality_alloc(isl_local_space_copy(ls));
		isl_int_set_si(v, -1);
		isl_constraint_set_coefficient(c, isl_dim_out,
				first + i + tile_len, v);
		isl_int_set_si(v, tile_size[i] - 1);
		isl_constraint_set_constant(c, v);
		bmap = isl_basic_map_add_constraint(bmap, c);
	}

	isl_local_space_free(ls);
	isl_int_clear(v);

	return isl_map_from_basic_map(bmap);
}
/* Construct a map from a domain of dimensionality "len"
 * to a domain of dimensionality "len" + "wrap_len" that "wraps"
 * the "wrap_len" coordinates starting at "first" according to "wrap_size".
 * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
 * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
 * that are projected out at the end.
 * "dim" prescribes the parameters.
 */
static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
	int first, int wrap_len, int *wrap_size)
{
	int i;
	isl_basic_map *bmap;
	isl_constraint *c;
	isl_local_space *ls;

	dim = isl_space_add_dims(dim, isl_dim_in, len);
	dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
	bmap = isl_basic_map_universe(isl_space_copy(dim));
	ls = isl_local_space_from_space(dim);

	for (i = 0; i < len; ++i) {
		int k = i < first + wrap_len ? i : i + 2 * wrap_len;

		c = isl_equality_alloc(isl_local_space_copy(ls));
		isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
		isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
		bmap = isl_basic_map_add_constraint(bmap, c);
	}

	for (i = 0; i < wrap_len; ++i) {
		c = isl_equality_alloc(isl_local_space_copy(ls));
		isl_constraint_set_coefficient_si(c, isl_dim_out,
				first + i, -1);
		isl_constraint_set_coefficient_si(c, isl_dim_out,
				first + wrap_len + i, 1);
		isl_constraint_set_coefficient_si(c, isl_dim_out,
				first + 2 * wrap_len + i, wrap_size[i]);
		bmap = isl_basic_map_add_constraint(bmap, c);

		c = isl_inequality_alloc(isl_local_space_copy(ls));
		isl_constraint_set_coefficient_si(c, isl_dim_out,
				first + wrap_len + i, 1);
		bmap = isl_basic_map_add_constraint(bmap, c);

		c = isl_inequality_alloc(isl_local_space_copy(ls));
		isl_constraint_set_coefficient_si(c, isl_dim_out,
				first + wrap_len + i, -1);
		isl_constraint_set_constant_si(c, wrap_size[i] - 1);
		bmap = isl_basic_map_add_constraint(bmap, c);
	}

	isl_local_space_free(ls);

	bmap = isl_basic_map_project_out(bmap, isl_dim_out,
			first + 2 * wrap_len, wrap_len);

	return isl_map_from_basic_map(bmap);
}
/* Add "n" parameters named prefix%d.
 */
static __isl_give isl_set *add_params( __isl_take isl_set *set,
	int n, const char *prefix)
{
	int i;
	unsigned nparam;
	char name[20];

	nparam = isl_set_dim(set, isl_dim_param);
	set = isl_set_add_dims(set, isl_dim_param, n);

	for (i = 0; i < n; ++i) {
		snprintf(name, sizeof(name), "%s%d", prefix, i);
		set = isl_set_set_dim_name(set, isl_dim_param,
				nparam + i, name);
	}

	return set;
}
/* Equate the "n" dimensions of "set" starting at "first" to
 * freshly created parameters named prefix%d.
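 *
 * For instance (illustrative arguments), parametrize(set, 0, 2, "t")
 * adds fresh parameters t0 and t1 and equates the first two set
 * dimensions to t0 and t1 respectively.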
 */
static __isl_give isl_set *parametrize(__isl_take isl_set *set,
	int first, int n, const char *prefix)
{
	int i;
	unsigned nparam;
	isl_int v;
	isl_space *dim;
	isl_basic_set *bset;
	isl_constraint *c;
	isl_local_space *ls;

	nparam = isl_set_dim(set, isl_dim_param);

	set = add_params(set, n, prefix);

	dim = isl_set_get_space(set);
	bset = isl_basic_set_universe(isl_space_copy(dim));
	ls = isl_local_space_from_space(dim);

	isl_int_init(v);

	for (i = 0; i < n; ++i) {
		c = isl_equality_alloc(isl_local_space_copy(ls));
		isl_int_set_si(v, -1);
		isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
		isl_int_set_si(v, 1);
		isl_constraint_set_coefficient(c, isl_dim_set, first + i, v);
		bset = isl_basic_set_add_constraint(bset, c);
	}

	isl_int_clear(v);
	isl_local_space_free(ls);

	return isl_set_intersect(set, isl_set_from_basic_set(bset));
}
static __isl_give isl_set *parametrization(__isl_take isl_space *dim,
	int len, int first, int n, const char *prefix)
{
	isl_set *set;

	dim = isl_space_add_dims(dim, isl_dim_set, len);
	set = isl_set_universe(dim);

	return parametrize(set, first, n, prefix);
}
/* Tile the B loops over the tile sizes and then tile/wrap
 * the T1 loops over the blocks.
 */
static __isl_give isl_union_map *tile_schedule(struct cuda_gen *gen,
	__isl_take isl_union_map *sched)
{
	isl_space *dim;
	isl_map *tiling, *block_tiling;

	dim = isl_union_map_get_space(sched);
	tiling = tile(isl_space_copy(dim), gen->untiled_len,
		gen->tile_first, gen->tile_len, gen->tile_size);

	if (gen->options->wrap)
		block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
			gen->tile_first, gen->n_grid, gen->grid_dim);
	else
		block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
			gen->tile_first, gen->n_grid, gen->grid_dim);

	gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;

	tiling = isl_map_apply_range(tiling, block_tiling);

	sched = isl_union_map_apply_range(sched,
		isl_union_map_from_map(tiling));

	gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;

	return sched;
}
static __isl_give isl_union_map *parametrize_tiled_schedule(
	struct cuda_gen *gen, __isl_take isl_union_map *sched)
{
	isl_space *dim;
	isl_set *par;

	dim = isl_union_map_get_space(sched);
	par = parametrization(dim, gen->tiled_len, 0, gen->tile_first, "h");
	sched = isl_union_map_intersect_range(sched,
		isl_union_set_from_set(par));

	dim = isl_union_map_get_space(sched);
	par = parametrization(dim, gen->tiled_len,
		gen->tile_first + gen->n_grid, gen->n_grid, "b");
	sched = isl_union_map_intersect_range(sched,
		isl_union_set_from_set(par));

	return sched;
}
/* Tile/wrap the P1 loops over the threads.
 */
static __isl_give isl_union_map *thread_tile_schedule(struct cuda_gen *gen,
	__isl_take isl_union_map *sched)
{
	isl_space *dim;
	isl_map *tiling;
	isl_set *par;

	dim = isl_union_map_get_space(sched);

	if (gen->options->wrap)
		tiling = wrap(isl_space_copy(dim), gen->tiled_len,
			gen->shared_len, gen->n_block, gen->block_dim);
	else
		tiling = tile(isl_space_copy(dim), gen->tiled_len,
			gen->shared_len, gen->n_block, gen->block_dim);
	gen->thread_tiled_len = gen->tiled_len + gen->n_block;

	sched = isl_union_map_apply_range(sched,
		isl_union_map_from_map(tiling));

	par = parametrization(dim, gen->thread_tiled_len,
		gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
		gen->n_block, "t");
	sched = isl_union_map_intersect_range(sched,
		isl_union_set_from_set(par));

	gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;

	return sched;
}
/* If the user asked for it, scale the shared memory tile loops
 * (T1T and T2) of "sched" by gen->tile_size[i].
 * If we are not performing "wrapping", then additionally scale the T1P
 * loops by gen->grid_dim[i].
 */
static __isl_give isl_union_map *scale_tile_loops(struct cuda_gen *gen,
	__isl_take isl_union_map *sched)
{
	int i;
	isl_space *dim;
	isl_basic_map *scale;
	isl_constraint *c;
	isl_local_space *ls;

	if (!gen->options->scale_tile_loops)
		return sched;

	dim = isl_union_map_get_space(sched);
	dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
	dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
	scale = isl_basic_map_universe(isl_space_copy(dim));
	ls = isl_local_space_from_space(dim);

	for (i = 0; i < gen->tiled_len; ++i) {
		int f = 1;

		if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
			f = gen->tile_size[i - gen->tile_first];
			if (!gen->options->wrap)
				f *= gen->grid_dim[i - gen->tile_first];
		} else if (i >= gen->tile_first + gen->n_grid &&
				i < gen->tile_first + gen->n_grid + gen->tile_len) {
			f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
		}

		c = isl_equality_alloc(isl_local_space_copy(ls));
		isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
		isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
		scale = isl_basic_map_add_constraint(scale, c);
	}

	isl_local_space_free(ls);

	sched = isl_union_map_apply_range(sched,
		isl_union_map_from_map(isl_map_from_basic_map(scale)));

	return sched;
}
/* If we are not performing "wrapping" and if the user asked for it,
 * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
 */
static __isl_give isl_union_map *scale_thread_tile_loops(struct cuda_gen *gen,
	__isl_take isl_union_map *sched)
{
	int i;
	isl_space *dim;
	isl_basic_map *scale;
	isl_constraint *c;
	isl_local_space *ls;

	if (gen->options->wrap)
		return sched;
	if (!gen->options->scale_tile_loops)
		return sched;

	dim = isl_union_map_get_space(sched);
	dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
	dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
	scale = isl_basic_map_universe(isl_space_copy(dim));
	ls = isl_local_space_from_space(dim);

	for (i = 0; i < gen->thread_tiled_len; ++i) {
		int f = 1;

		if (i >= gen->shared_len &&
		    i < gen->shared_len + gen->n_block)
			f = gen->block_dim[i - gen->shared_len];

		c = isl_equality_alloc(isl_local_space_copy(ls));
		isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
		isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
		scale = isl_basic_map_add_constraint(scale, c);
	}

	isl_local_space_free(ls);

	sched = isl_union_map_apply_range(sched,
		isl_union_map_from_map(isl_map_from_basic_map(scale)));

	return sched;
}
/* If we are not performing "wrapping" and if the user asked for it,
 * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
 */
static __isl_give isl_union_map *scale_access_tile_loops(struct cuda_gen *gen,
	__isl_take isl_union_map *sched, int len, int first, int n_tile)
{
	int i;
	isl_space *dim;
	isl_basic_map *scale;
	isl_constraint *c;
	isl_local_space *ls;

	if (gen->options->wrap)
		return sched;
	if (!gen->options->scale_tile_loops)
		return sched;

	dim = isl_union_map_get_space(sched);
	dim = isl_space_add_dims(dim, isl_dim_in, len);
	dim = isl_space_add_dims(dim, isl_dim_out, len);
	scale = isl_basic_map_universe(isl_space_copy(dim));
	ls = isl_local_space_from_space(dim);

	for (i = 0; i < len; ++i) {
		int f = 1;

		if (i >= first && i < first + n_tile)
			f = gen->block_dim[i - first];

		c = isl_equality_alloc(isl_local_space_copy(ls));
		isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
		isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
		scale = isl_basic_map_add_constraint(scale, c);
	}

	isl_local_space_free(ls);

	sched = isl_union_map_apply_range(sched,
		isl_union_map_from_map(isl_map_from_basic_map(scale)));

	return sched;
}
/* If print_user_stmt is set, we want to print the statements ourselves,
 * instead of relying on the C preprocessor.  If so, we need to use
 * the stop option so that the domains will be saved on the statement
 * nodes.
 */
static void print_cloog_shared_body(struct cuda_gen *gen,
	__isl_keep isl_set *context, __isl_keep isl_union_map *sched, int len,
	void (*print_user_stmt)(struct clast_printer_info *info,
		struct clast_user_stmt *s),
	int first_unroll)
{
	int i;
	CloogOptions *options;
	CloogDomain *cloog_context;
	CloogUnionDomain *ud;
	CloogInput *input;
	struct clast_stmt *stmt;
	char name[20];

	sched = isl_union_map_copy(sched);
	sched = isl_union_map_align_params(sched, isl_set_get_space(context));

	options = cloog_options_malloc(gen->state);
	options->language = CLOOG_LANGUAGE_C;
	options->strides = 1;
	options->sh = 1;
	options->f = len;
	options->l = -1;
	options->override = 1;
	options->save_domains = 1;
	options->noscalars = 1;
	options->first_unroll = first_unroll;

	ud = cloog_union_domain_from_isl_union_map(sched);
	for (i = 0; i < len; ++i) {
		snprintf(name, sizeof(name), "c%d", i);
		ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
	}
	cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
	input = cloog_input_alloc(cloog_context, ud);

	stmt = cloog_clast_create_from_input(input, options);

	gen->stmt_code.indent = gen->kernel_code.indent;
	gen->stmt_code.dst = gen->cuda.kernel_c;
	gen->stmt_code.print_user_stmt = print_user_stmt;
	gen->stmt_code.print_user_stmt_list = NULL;
	gen->stmt_code.print_for_head = NULL;
	gen->stmt_code.print_for_foot = NULL;
	gen->stmt_code.user = gen;
	print_clast(&gen->stmt_code, stmt);

	cloog_clast_free(stmt);
	cloog_options_free(options);
}
/* Add "len" parameters p[i] called prefix%d,
 * with bounds 0 <= p[i] < size[i].
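 *
 * For instance (illustrative arguments), with len = 2, sizes 32 and 16
 * and prefix "t", this adds parameters t0 and t1 along with the
 * constraints 0 <= t0 <= 31 and 0 <= t1 <= 15.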
 */
__isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
	int len, int *size, const char *prefix)
{
	int i;
	unsigned nparam;
	isl_int v;
	isl_space *dim;
	isl_basic_set *bset;
	isl_constraint *c;
	isl_local_space *ls;
	char name[20];

	nparam = isl_set_dim(set, isl_dim_param);
	set = isl_set_add_dims(set, isl_dim_param, len);

	for (i = 0; i < len; ++i) {
		snprintf(name, sizeof(name), "%s%d", prefix, i);
		set = isl_set_set_dim_name(set, isl_dim_param,
				nparam + i, name);
	}

	dim = isl_set_get_space(set);
	bset = isl_basic_set_universe(isl_space_copy(dim));
	ls = isl_local_space_from_space(dim);

	isl_int_init(v);

	for (i = 0; i < len; ++i) {
		c = isl_inequality_alloc(isl_local_space_copy(ls));
		isl_int_set_si(v, 1);
		isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
		bset = isl_basic_set_add_constraint(bset, c);

		c = isl_inequality_alloc(isl_local_space_copy(ls));
		isl_int_set_si(v, -1);
		isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
		isl_int_set_si(v, size[i] - 1);
		isl_constraint_set_constant(c, v);
		bset = isl_basic_set_add_constraint(bset, c);
	}

	isl_int_clear(v);
	isl_local_space_free(ls);

	return isl_set_intersect(set, isl_set_from_basic_set(bset));
}
static void print_shared_body(struct cuda_gen *gen,
	__isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched,
	int len, void (*print_user_stmt)(struct clast_printer_info *info,
		struct clast_user_stmt *s),
	int first_unroll)
{
	isl_set *context;

	context = isl_set_copy(shared_domain);
	context = parametrize(context, 0, gen->shared_len, "g");
	context = isl_set_project_out(context, isl_dim_set, 0, gen->shared_len);
	context = add_bounded_parameters(context,
		gen->n_block, gen->block_dim, "t");

	print_cloog_shared_body(gen, context, sched, len, print_user_stmt,
		first_unroll);

	isl_set_free(context);
}
/* Given a tile of an array, construct a map that maps each element
 * of the tile to a copy of the tile shifted to the origin
 * (based on the lower bounds in group->private_bound or group->shared_bound).
 * If any of the indices is strided, then {private,shared}_bound[i].shift_map
 * is applied to the index first.
 * The domain of the resulting map is "access",
 * while the range space is anonymous.
 */
static __isl_give isl_map *shift_access(__isl_take isl_set *access,
	struct cuda_array_ref_group *group)
{
	int i;
	isl_space *dim;
	isl_basic_set *bset;
	isl_basic_map *bmap;
	isl_aff *lb;
	isl_basic_set *offset;
	isl_basic_map *shift;
	isl_basic_map *pre_shift;
	isl_map *sched;
	const char *name;
	struct cuda_array_bound *bounds;
	int n_index = group->array->n_index;

	bounds = group->private_bound;
	if (!bounds)
		bounds = group->shared_bound;

	dim = isl_set_get_space(access);
	dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
	offset = isl_basic_set_universe(dim);
	for (i = 0; i < n_index; ++i) {
		lb = isl_aff_copy(bounds[i].lb);
		bmap = isl_basic_map_from_aff(lb);
		bset = isl_basic_map_range(bmap);
		offset = isl_basic_set_flat_product(offset, bset);
	}
	offset = isl_basic_set_neg(offset);

	dim = isl_space_map_from_set(isl_set_get_space(access));
	shift = isl_basic_map_identity(dim);
	shift = isl_basic_map_set_tuple_name(shift, isl_dim_out, NULL);

	bset = isl_basic_set_universe(isl_set_get_space(access));
	bmap = isl_basic_map_from_domain_and_range(bset, offset);

	shift = isl_basic_map_sum(shift, bmap);

	dim = isl_set_get_space(access);
	dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
	dim = isl_space_map_from_set(dim);
	pre_shift = isl_basic_map_universe(isl_space_copy(dim));
	dim = isl_space_add_dims(dim, isl_dim_in, 1);
	dim = isl_space_add_dims(dim, isl_dim_out, 1);
	for (i = 0; i < n_index; ++i) {
		if (!bounds[i].shift_map)
			bmap = isl_basic_map_identity(isl_space_copy(dim));
		else
			bmap = isl_basic_map_copy(bounds[i].shift_map);
		pre_shift = isl_basic_map_flat_product(pre_shift, bmap);
	}
	isl_space_free(dim);
	name = isl_basic_map_get_tuple_name(shift, isl_dim_in);
	pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_in, name);
	pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_out, name);
	shift = isl_basic_map_apply_range(pre_shift, shift);

	sched = isl_map_from_basic_map(shift);
	sched = isl_map_intersect_domain(sched, access);

	return sched;
}
/* Construct a schedule for iterating over all elements in the given
 * piece of an array.  The schedule iterates over a copy of the piece
 * that is shifted to the origin.
 * We subsequently also perform the tiling/wrapping over the threads.
 *
 * In particular, we tile the final iterators so that the final thread
 * dimension runs over the final array dimension.
 * However, if those final iterators have only a single iteration,
 * we try to tile earlier iterators instead.
 */
static __isl_give isl_union_map *access_schedule(struct cuda_gen *gen,
	__isl_take isl_set *access, struct cuda_array_ref_group *group)
{
	isl_space *dim;
	isl_map *sched;
	isl_union_map *usched;
	isl_map *tiling;
	isl_set *par;
	unsigned nvar = isl_set_dim(access, isl_dim_set);
	int n_tile;
	int first;

	sched = shift_access(access, group);

	n_tile = gen->n_block;
	if (n_tile > nvar) {
		int i;
		sched = isl_map_insert_dims(sched,
			isl_dim_out, 0, n_tile - nvar);
		for (i = 0; i < n_tile - nvar; ++i)
			sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
		nvar = n_tile;
	}

	first = nvar - n_tile;

	for (; first > 0; first --)
		if (!isl_map_plain_is_fixed(sched, isl_dim_out,
						first + n_tile - 1, NULL))
			break;

	dim = isl_map_get_space(sched);
	dim = isl_space_params(dim);
	if (gen->options->wrap)
		tiling = wrap(isl_space_copy(dim), nvar, first,
				n_tile, gen->block_dim);
	else
		tiling = tile(isl_space_copy(dim), nvar, first,
				n_tile, gen->block_dim);
	sched = isl_map_apply_range(sched, tiling);

	par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
	usched = isl_union_map_from_map(sched);
	usched = isl_union_map_intersect_range(usched,
		isl_union_set_from_set(par));

	usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
		first, n_tile);

	return usched;
}
/* Print an access to the element in the global memory copy of the
 * given array that corresponds to the element described by "pma"
 * of the original array.
 * The copy in global memory has been linearized, so we need to take
 * the array size into account.
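 *
 * As an illustration (hypothetical array and bound names), an access
 * to element [i][j] of a two-dimensional array A whose (local) bound
 * in the second dimension is n1 is printed as
 *
 *	A[(i) * (n1) + j]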
 */
static void print_global_index(FILE *out,
	struct cuda_array_info *array, __isl_keep isl_pw_multi_aff *pma,
	__isl_keep isl_set *domain)
{
	int i;
	isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
	isl_printer *prn;

	if (cuda_array_is_scalar(array)) {
		fprintf(out, "*%s", array->name);
		return;
	}

	fprintf(out, "%s[", array->name);
	prn = isl_printer_to_file(ctx, out);
	prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
	for (i = 0; i + 1 < array->n_index; ++i)
		prn = isl_printer_print_str(prn, "(");
	for (i = 0; i < array->n_index; ++i) {
		isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);
		pa = isl_pw_aff_coalesce(pa);
		pa = isl_pw_aff_gist(pa, isl_set_copy(domain));
		if (i) {
			prn = isl_printer_print_str(prn, ") * (");
			prn = isl_printer_print_pw_aff(prn,
				array->local_bound[i]);
			prn = isl_printer_print_str(prn, ") + ");
		}
		prn = isl_printer_print_pw_aff(prn, pa);
		isl_pw_aff_free(pa);
	}
	isl_printer_free(prn);
	fprintf(out, "]");
}
/* Given an index expression into a tile of an array, adjust the expression
 * to a shift of the tile to the origin
 * (based on the lower bounds in array->shared_bound).
 * If the index is strided, then we first add
 * bound->shift and divide by bound->stride.
 */
static __isl_give isl_pw_aff *shift_index(__isl_take isl_pw_aff *pa,
	struct cuda_array_info *array,
	struct cuda_array_bound *bound, __isl_take isl_set *domain)
{
	isl_aff *lb;
	isl_pw_aff *tmp;

	if (bound->shift) {
		isl_aff *shift;
		shift = bound->shift;
		shift = isl_aff_copy(shift);
		shift = isl_aff_project_domain_on_params(shift);
		shift = isl_aff_align_params(shift, isl_pw_aff_get_space(pa));
		tmp = isl_pw_aff_alloc(isl_set_copy(domain), shift);
		pa = isl_pw_aff_add(pa, tmp);
		pa = isl_pw_aff_scale_down(pa, bound->stride);
	}

	lb = isl_aff_copy(bound->lb);
	lb = isl_aff_project_domain_on_params(lb);

	lb = isl_aff_align_params(lb, isl_pw_aff_get_space(pa));

	tmp = isl_pw_aff_alloc(isl_set_copy(domain), lb);
	pa = isl_pw_aff_sub(pa, tmp);
	pa = isl_pw_aff_coalesce(pa);
	pa = isl_pw_aff_gist(pa, domain);

	return pa;
}
/* Print an access to the element in the private/shared memory copy of the
 * given array reference group that corresponds to the element described
 * by "pma" of the original array.
 * Since the array in private/shared memory is just a shifted copy of part
 * of the original array, we simply need to subtract the lower bound,
 * which was computed in can_tile_for_shared_memory.
 * If any of the indices is strided, then we first add
 * bounds[i].shift and divide by bounds[i].stride.
 */
static void print_local_index(FILE *out,
	struct cuda_array_ref_group *group, struct cuda_array_bound *bounds,
	__isl_keep isl_pw_multi_aff *pma, __isl_keep isl_set *domain)
{
	int i;
	isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
	isl_printer *prn;
	struct cuda_array_info *array = group->array;

	print_array_name(out, group);
	for (i = 0; i < array->n_index; ++i) {
		isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);

		pa = shift_index(pa, array, &bounds[i], isl_set_copy(domain));

		fprintf(out, "[");
		prn = isl_printer_to_file(ctx, out);
		prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
		prn = isl_printer_print_pw_aff(prn, pa);
		isl_printer_free(prn);
		fprintf(out, "]");
		isl_pw_aff_free(pa);
	}
}
/* This function is called for each leaf in the clast of the code
 * for copying to or from shared/private memory.
 * The statement name is {read,write}_{shared,private}_<array>.
 *
 * The schedule iterates over the array elements, so we can use
 * the domain of copy_sched at the current scheduling position
 * as the index of the array.
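 *
 * The printed statement is a plain assignment between the local and the
 * global copy.  Schematically (hypothetical array name), a read statement
 * for an array A mapped to shared memory comes out as
 *
 *	shared_A[...] = A[...];
 *
 * and a write statement as the reverse assignment.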
 */
static void print_copy_statement(struct clast_printer_info *code,
	struct clast_user_stmt *u)
{
	struct cuda_gen *gen = code->user;
	isl_set *domain;
	isl_map *sched;
	struct cuda_array_ref_group *group = gen->copy_group;
	struct cuda_array_bound *bounds = gen->copy_bound;
	unsigned n_in;
	unsigned n_out;
	isl_space *dim;
	isl_set *param;
	isl_set *index;
	isl_pw_multi_aff *pma;
	int read;

	read = !strncmp(u->statement->name, "read", 4);

	domain = extract_host_domain(u);
	assert(domain);

	sched = isl_map_copy(gen->copy_sched);
	sched = isl_map_reverse(sched);
	sched = isl_map_intersect_domain(sched, domain);
	n_in = isl_map_dim(sched, isl_dim_in);
	n_out = isl_map_dim(sched, isl_dim_out);
	dim = isl_map_get_space(sched);
	dim = isl_space_drop_dims(dim, isl_dim_in, 0, n_in);
	dim = isl_space_drop_dims(dim, isl_dim_out, 0, n_out);
	param = parametrization(dim, n_in, 0, n_in, "c");
	sched = isl_map_align_params(sched, isl_set_get_space(param));
	sched = isl_map_intersect_domain(sched, param);
	index = isl_map_range(sched);
	domain = isl_set_copy(index);
	pma = isl_pw_multi_aff_from_set(index);
	pma = isl_pw_multi_aff_coalesce(pma);
	domain = isl_set_params(domain);

	print_indent(code->dst, code->indent);
	if (read) {
		print_local_index(code->dst, group, bounds, pma, domain);
		fprintf(code->dst, " = ");
		print_global_index(code->dst, group->array, pma, domain);
	} else {
		print_global_index(code->dst, group->array, pma, domain);
		fprintf(code->dst, " = ");
		print_local_index(code->dst, group, bounds, pma, domain);
	}
	fprintf(code->dst, ";\n");

	isl_pw_multi_aff_free(pma);
	isl_set_free(domain);
}
static void print_shared_access(struct cuda_gen *gen,
	__isl_keep isl_set *shared_domain, __isl_take isl_set *access,
	const char *type, struct cuda_array_ref_group *group)
{
	const char *array_name;
	char *name;
	isl_ctx *ctx;
	isl_union_map *sched;
	unsigned nvar = isl_set_dim(access, isl_dim_set);
	int n_tile;

	ctx = isl_set_get_ctx(access);
	array_name = isl_set_get_tuple_name(access);
	name = isl_alloc_array(ctx, char,
		strlen(type) + sizeof("_shared_") + strlen(array_name) + 20);
	if (group->array->n_group > 1)
		sprintf(name, "%s_shared_%s_%d", type, array_name, group->nr);
	else
		sprintf(name, "%s_shared_%s", type, array_name);
	access = isl_set_set_tuple_name(access, name);
	free(name);

	sched = access_schedule(gen, access, group);

	n_tile = gen->n_block;
	if (n_tile > nvar)
		n_tile = nvar;

	gen->copy_sched = isl_map_from_union_map(isl_union_map_copy(sched));
	gen->copy_group = group;
	gen->copy_bound = group->shared_bound;

	print_shared_body(gen, shared_domain, sched, nvar + n_tile,
		&print_copy_statement, -1);

	isl_union_map_free(sched);
	isl_map_free(gen->copy_sched);
}
/* Return the union of all read (read = 1) and/or write (write = 1)
 * access relations in the group.
 */
static __isl_give isl_union_map *group_access_relation(
	struct cuda_array_ref_group *group, int read, int write)
{
	int i;
	isl_union_map *access;

	access = isl_union_map_empty(isl_map_get_space(group->access));
	for (i = 0; i < group->n_ref; ++i) {
		isl_map *map_i;

		if (!((read && group->refs[i]->read) ||
		      (write && group->refs[i]->write)))
			continue;
		map_i = isl_map_copy(group->refs[i]->access);
		access = isl_union_map_union(access,
			isl_union_map_from_map(map_i));
	}

	return access;
}
/* Check that none of the shared memory tiles involve any strides.
 */
static int no_strides(struct cuda_array_ref_group *group)
{
	int i;
	int n_index = group->array->n_index;

	for (i = 0; i < n_index; ++i)
		if (group->shared_bound[i].shift)
			return 0;

	return 1;
}
/* Return a set containing the values of the given index i
 * of the elements in the array tile in global memory that corresponds
 * to the shared memory copy.
 * In particular, if a is the index, we return a set with constraints
 *
 *	tile_offset <= a <= tile_offset + tile_size - 1
 *
 * and
 *
 *	0 <= a <= array_size - 1
 */
static __isl_give isl_set *group_tile_dim(struct cuda_array_ref_group *group,
	int i)
{
	isl_basic_set *tile;
	isl_aff *aff;
	isl_constraint *c;
	isl_local_space *ls;
	isl_pw_aff *bound;
	isl_set *dom;
	isl_set *tile_set;

	aff = isl_aff_copy(group->shared_bound[i].lb);
	aff = isl_aff_add_dims(aff, isl_dim_in, 1);
	ls = isl_aff_get_domain_local_space(aff);
	aff = isl_aff_neg(aff);
	aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
	c = isl_inequality_from_aff(isl_aff_copy(aff));
	tile = isl_basic_set_from_constraint(c);

	aff = isl_aff_neg(aff);
	aff = isl_aff_add_constant(aff, group->shared_bound[i].size);
	aff = isl_aff_add_constant_si(aff, -1);
	c = isl_inequality_from_aff(aff);
	tile = isl_basic_set_add_constraint(tile, c);

	aff = isl_aff_zero_on_domain(ls);
	aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
	c = isl_inequality_from_aff(aff);
	tile = isl_basic_set_add_constraint(tile, c);

	bound = isl_pw_aff_copy(group->array->bound[i]);
	bound = isl_pw_aff_add_dims(bound, isl_dim_in, 1);
	ls = isl_local_space_from_space(isl_pw_aff_get_domain_space(bound));
	aff = isl_aff_zero_on_domain(ls);
	aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
	aff = isl_aff_add_constant_si(aff, 1);
	dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));

	tile_set = isl_pw_aff_ge_set(bound, isl_pw_aff_alloc(dom, aff));
	tile_set = isl_set_align_params(tile_set, isl_basic_set_get_space(tile));
	tile_set = isl_set_intersect(tile_set, isl_set_from_basic_set(tile));

	return tile_set;
}
/* Return a set containing the elements in the array tile in
 * global memory that corresponds to the shared memory copy.
 */
static __isl_give isl_set *group_tile(struct cuda_array_ref_group *group)
{
	int i;
	int n_index = group->array->n_index;
	isl_set *tile;

	tile = group_tile_dim(group, 0);
	for (i = 1; i < n_index; ++i) {
		isl_set *tile_i;

		tile_i = group_tile_dim(group, i);
		tile = isl_set_flat_product(tile, tile_i);
	}

	tile = isl_set_set_tuple_name(tile, group->array->name);

	return tile;
}
/* Print code for reading into or writing from shared memory
 * the given array reference group.
 *
 * sched maps the original iteration domains to the shared memory tile loops.
 *
 * If we are performing a read from global memory to shared memory,
 * if the array involved is not a scalar and if the definition of the
 * shared memory tiles does not involve any strides, then we copy
 * the entire tile to shared memory.  This may result in some extra
 * elements getting copied, but it should lead to simpler code
 * (which means that fewer registers may be needed) and less divergence.
 *
 * Otherwise, we only copy the elements that will be read or have been written
 * in the kernel.
 *
 * Note that the absence of stride requirement can easily be lifted.
 * We would just need to add constraints of the form
 *
 *	shift + a = stride * alpha
 */
static int print_group_shared_accesses(struct cuda_gen *gen,
	struct cuda_array_ref_group *group, const char *type,
	__isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched)
{
	int read;
	isl_union_map *access;
	isl_union_set *uset;
	isl_set *access_set;

	if (group->private_bound)
		return 0;
	if (!group->shared_bound)
		return 0;

	read = !strcmp(type, "read");

	access = group_access_relation(group, read, !read);
	access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
	uset = isl_union_map_range(access);

	if (isl_union_set_is_empty(uset)) {
		isl_union_set_free(uset);
		return 0;
	}

	if (read && group->array->n_index > 0 && no_strides(group)) {
		isl_union_set_free(uset);
		access_set = group_tile(group);
		print_shared_access(gen, shared_domain, access_set,
			type, group);
		return 1;
	}

	access_set = isl_set_from_union_set(uset);
	access_set = isl_set_coalesce(access_set);

	print_shared_access(gen, shared_domain, access_set, type, group);

	return 1;
}
/* Print code for reading into or writing from shared memory at
 * the given level (-1 for innermost).
 *
 * If we are not printing at the innermost level, then the dimensionality
 * of shared_domain may be smaller than gen->shared_len.
 * As the rest of the code assumes that the domain of access has
 * gen->shared_len dimensions, we therefore may need to embed this domain
 * in a higher dimensional space after intersection with shared_domain.
 */
static void print_shared_accesses(struct cuda_gen *gen,
	__isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
	const char *type, int level)
{
	int i, j;
	isl_space *dim;
	isl_map *proj;
	isl_set *par;
	int shared_len = isl_set_dim(shared_domain, isl_dim_set);
	int sync = 0;
	isl_union_map *sched;

	shared_domain = isl_set_copy(shared_domain);
	sched = isl_union_map_copy(gen->tiled_sched);
	dim = isl_union_map_get_space(sched);
	proj = projection(dim, gen->tiled_len, shared_len);
	sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
	sched = isl_union_map_intersect_range(sched,
		isl_union_set_from_set(isl_set_copy(shared_domain)));
	if (shared_len != gen->shared_len) {
		dim = isl_union_map_get_space(sched);
		proj = projection(dim, gen->shared_len, shared_len);
		proj = isl_map_reverse(proj);
		shared_domain = isl_set_apply(shared_domain,
			isl_map_copy(proj));
		sched = isl_union_map_apply_range(sched,
			isl_union_map_from_map(proj));
	}

	dim = isl_union_map_get_space(sched);
	par = parametrization(dim, gen->shared_len, 0, gen->shared_len, "g");
	sched = isl_union_map_intersect_range(sched,
		isl_union_set_from_set(par));

	for (i = 0; i < gen->n_array; ++i) {
		struct cuda_array_info *array = &gen->array[i];

		for (j = 0; j < array->n_group; ++j) {
			if (array->groups[j]->print_shared_level != level)
				continue;

			if (print_group_shared_accesses(gen, array->groups[j],
					type, shared_domain, sched))
				sync = 1;
		}
	}

	isl_union_map_free(sched);
	isl_set_free(shared_domain);

	if (sync) {
		print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
		fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
	}
}
1946 /* This function is called for each access to an array in some statement
1947 * in the original code.
1948 * Replace that access by an access to shared or (linearized) global memory.
1949 * Since the array in shared memory is just
1950 * a shifted copy of part of the original array, we simply need
1951 * to subtract the lower bound, which was computed
1952 * in can_tile_for_shared_memory.
1953 * If any of the indices is strided, then we first add
1954 * shared_bound[i].shift and divide by shared_bound[i].stride.
1956 * If the given array is accessed directly from global memory,
1957 * we don't need to perform any shifting and instead simply simplify
1958 * the expression in the context of the domain.
1960 * If the array space (range of access) has no name, then we are
1961 * accessing an iterator in the original program.
1963 static void print_access(struct cuda_gen *gen, __isl_take isl_map *access,
1964 int group_nr)
1966 int i;
1967 const char *name;
1968 unsigned n_index;
1969 struct cuda_array_info *array = NULL;
1970 isl_printer *prn;
1971 isl_pw_multi_aff *pma;
1972 isl_set *data_set;
1973 isl_set *domain;
1974 struct cuda_array_bound *bounds = NULL;
1976 access = isl_map_align_params(access,
1977 isl_set_get_space(gen->stmt_domain));
1979 data_set = isl_set_apply(isl_set_copy(gen->stmt_domain), access);
1981 name = isl_set_get_tuple_name(data_set);
1983 if (!name)
1984 fprintf(gen->cuda.kernel_c, "(");
1985 else {
1986 struct cuda_array_ref_group *group;
1988 for (i = 0; i < gen->n_array; ++i) {
1989 if (strcmp(name, gen->array[i].name))
1990 continue;
1991 array = &gen->array[i];
1993 assert(array);
1994 group = array->groups[group_nr];
1995 bounds = group->private_bound;
1996 if (!bounds)
1997 bounds = group->shared_bound;
1999 if (!bounds && cuda_array_is_scalar(array) && !array->read_only)
2000 fprintf(gen->cuda.kernel_c, "*");
2001 print_array_name(gen->cuda.kernel_c, group);
2003 if (cuda_array_is_scalar(array)) {
2004 isl_set_free(data_set);
2005 return;
2008 fprintf(gen->cuda.kernel_c, "[");
2012 n_index = isl_set_dim(data_set, isl_dim_set);
2013 pma = isl_pw_multi_aff_from_set(data_set);
2014 pma = isl_pw_multi_aff_coalesce(pma);
2016 prn = isl_printer_to_file(gen->ctx, gen->cuda.kernel_c);
2017 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
2019 if (!bounds)
2020 for (i = 0; i + 1 < n_index; ++i)
2021 prn = isl_printer_print_str(prn, "(");
2023 for (i = 0; i < n_index; ++i) {
2024 isl_pw_aff *index;
2026 index = isl_pw_multi_aff_get_pw_aff(pma, i);
2028 if (!array) {
2029 prn = isl_printer_print_pw_aff(prn, index);
2030 isl_pw_aff_free(index);
2031 continue;
2034 domain = isl_set_copy(gen->stmt_domain);
2035 domain = isl_set_params(domain);
2036 if (!bounds) {
2037 index = isl_pw_aff_coalesce(index);
2038 index = isl_pw_aff_gist(index, domain);
2039 } else
2040 index = shift_index(index, array, &bounds[i], domain);
2042 if (i) {
2043 if (!bounds) {
2044 prn = isl_printer_print_str(prn, ") * (");
2045 prn = isl_printer_print_pw_aff(prn,
2046 array->local_bound[i]);
2047 prn = isl_printer_print_str(prn, ") + ");
2048 } else
2049 prn = isl_printer_print_str(prn, "][");
2051 prn = isl_printer_print_pw_aff(prn, index);
2052 isl_pw_aff_free(index);
2054 if (!name)
2055 prn = isl_printer_print_str(prn, ")");
2056 else
2057 prn = isl_printer_print_str(prn, "]");
2058 isl_printer_free(prn);
2060 isl_pw_multi_aff_free(pma);
2063 struct cuda_access_print_info {
2064 struct cuda_gen *gen;
2065 struct cuda_stmt_access *access;
2068 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
2069 * with the pet printer. This means that whenever the pet printer prints a
2070 * pet access expression we have the corresponding cuda access available and can
2071 * print the modified access.
2073 static void print_cuda_access(struct pet_expr *expr, void *usr)
2075 struct cuda_access_print_info *info =
2076 (struct cuda_access_print_info *) usr;
2077 print_access(info->gen, isl_map_copy(info->access->access),
2078 info->access->group);
2079 info->access = info->access->next;
2082 static void print_stmt_body(struct cuda_gen *gen,
2083 FILE *out, struct cuda_stmt *stmt)
2085 struct cuda_access_print_info info;
2087 info.gen = gen;
2088 info.access = stmt->accesses;
2090 print_pet_expr(out, stmt->body, print_cuda_access, &info);
2091 fprintf(out, ";\n");
2094 /* This function is called for each leaf in the innermost clast,
2095 * i.e., for each statement.
2096 * We print the statement body, simplifying the accesses based
2097 * on the schedule.
2099 static void print_statement(struct clast_printer_info *code,
2100 struct clast_user_stmt *u)
2102 struct cuda_gen *gen = code->user;
2103 isl_space *dim;
2104 isl_set *par;
2105 isl_set *stmt_domain;
2106 isl_union_map *stmt_sched;
2107 isl_union_set *uset;
2108 int nr;
2109 struct cuda_stmt *stmt;
2111 nr = atoi(u->statement->name + 2);
2112 stmt = &gen->stmts[nr];
2114 stmt_domain = extract_host_domain(u);
2116 stmt_sched = isl_union_map_intersect_range(
2117 isl_union_map_copy(gen->local_sched),
2118 isl_union_set_from_set(extend(stmt_domain,
2119 gen->thread_tiled_len)));
2120 dim = isl_union_map_get_space(stmt_sched);
2121 par = parametrization(dim, gen->thread_tiled_len, 0,
2122 gen->thread_tiled_len, "c");
2123 stmt_sched = isl_union_map_intersect_range(stmt_sched,
2124 isl_union_set_from_set(par));
2126 uset = isl_union_map_domain(stmt_sched);
2127 dim = isl_union_set_get_space(uset);
2128 dim = isl_space_add_dims(dim, isl_dim_set,
2129 isl_set_dim(stmt->domain, isl_dim_set));
2130 dim = isl_space_set_tuple_name(dim, isl_dim_set, u->statement->name);
2131 gen->stmt_domain = isl_union_set_extract_set(uset, dim);
2132 isl_union_set_free(uset);
2134 print_indent(code->dst, code->indent);
2135 print_stmt_body(gen, code->dst, stmt);
2137 isl_set_free(gen->stmt_domain);
2140 static void print_private_access(struct cuda_gen *gen,
2141 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
2142 const char *type, struct cuda_array_ref_group *group)
2144 const char *array_name;
2145 char *name;
2146 isl_ctx *ctx;
2147 unsigned nvar = isl_set_dim(access, isl_dim_set);
2148 isl_union_map *usched;
2150 if (isl_set_fast_is_empty(access)) {
2151 isl_set_free(access);
2152 return;
2155 ctx = isl_set_get_ctx(access);
2156 array_name = isl_set_get_tuple_name(access);
2157 name = isl_alloc_array(ctx, char,
2158 strlen(type) + sizeof("_private_") + strlen(array_name) + 20);
2159 if (group->array->n_group > 1)
2160 sprintf(name, "%s_private_%s_%d", type, array_name, group->nr);
2161 else
2162 sprintf(name, "%s_private_%s", type, array_name);
2163 access = isl_set_set_tuple_name(access, name);
2164 free(name);
2166 gen->copy_sched = shift_access(access, group);
2167 gen->copy_group = group;
2168 gen->copy_bound = group->private_bound;
2170 usched = isl_union_map_from_map(isl_map_copy(gen->copy_sched));
2171 print_shared_body(gen, shared_domain, usched, nvar,
2172 &print_copy_statement, 1);
2173 isl_union_map_free(usched);
2175 isl_map_free(gen->copy_sched);
2178 /* Print code for reading into or writing from private memory
2179 * the given array reference group.
2181 * sched maps the original iteration domains to the shared memory tile loops.
2183 static void print_group_private_accesses(struct cuda_gen *gen,
2184 struct cuda_array_ref_group *group,
2185 const char *type, __isl_keep isl_set *shared_domain,
2186 unsigned first_shared, int shared_len, __isl_keep isl_union_map *sched)
2188 int read;
2189 isl_union_map *access;
2190 isl_union_set *uset;
2191 isl_set *access_set;
2193 if (!group->private_bound)
2194 return;
2196 read = !strcmp(type, "read");
2198 access = group_access_relation(group, read, !read);
2199 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
2200 access = isl_union_map_intersect(access,
2201 isl_union_map_copy(gen->private_access));
2202 uset = isl_union_map_range(access);
2204 if (isl_union_set_is_empty(uset)) {
2205 isl_union_set_free(uset);
2206 return;
2209 access_set = isl_set_from_union_set(uset);
2210 access_set = isl_set_coalesce(access_set);
2211 access_set = isl_set_eliminate(access_set, isl_dim_param,
2212 first_shared + shared_len,
2213 gen->shared_len - shared_len);
2215 print_private_access(gen, shared_domain, access_set, type, group);
2218 /* Print code for reading into or writing from private memory at
2219 * the given level (-1 for innermost).
2221 * If we are not printing at the innermost level, then the dimensionality
2222 * of shared_domain may be smaller than gen->shared_len.
2223 * As the rest of the code assumes that the domain of access has
2224 * gen->shared_len dimensions, we therefore may need to embed this domain
2225 * in a higher dimensional space after intersection with shared_domain.
2227 * This code is very similar to print_shared_accesses.
2228 * The main difference is that we need to take into account gen->private_access.
2230 static void print_private_accesses(struct cuda_gen *gen,
2231 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
2232 const char *type, int level)
2234 int i, j;
2235 isl_space *dim;
2236 isl_map *proj;
2237 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
2238 unsigned first_shared;
2239 isl_union_map *sched;
2241 shared_domain = isl_set_copy(shared_domain);
2242 sched = isl_union_map_copy(gen->tiled_sched);
2243 dim = isl_union_map_get_space(sched);
2244 first_shared = isl_space_dim(dim, isl_dim_param);
2245 proj = projection(dim, gen->tiled_len, shared_len);
2246 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2247 sched = isl_union_map_intersect_range(sched,
2248 isl_union_set_from_set(isl_set_copy(shared_domain)));
2249 if (shared_len != gen->shared_len) {
2250 dim = isl_union_map_get_space(sched);
2251 proj = projection(dim, gen->shared_len, shared_len);
2252 proj = isl_map_reverse(proj);
2253 shared_domain = isl_set_apply(shared_domain,
2254 isl_map_copy(proj));
2255 sched = isl_union_map_apply_range(sched,
2256 isl_union_map_from_map(proj));
2259 for (i = 0; i < gen->n_array; ++i) {
2260 struct cuda_array_info *array = &gen->array[i];
2262 for (j = 0; j < array->n_group; ++j) {
2263 if (array->groups[j]->print_shared_level != level)
2264 continue;
2266 print_group_private_accesses(gen, array->groups[j],
2267 type, shared_domain,
2268 first_shared, shared_len, sched);
2272 isl_union_map_free(sched);
2273 isl_set_free(shared_domain);
2276 /* Set unroll[j] if the input dimension j is involved in
2277 * the index expression represented by bmap.
2279 static int check_unroll(__isl_take isl_basic_map *bmap, void *user)
2281 int i, j;
2282 int n_in = isl_basic_map_dim(bmap, isl_dim_in);
2283 int n_out = isl_basic_map_dim(bmap, isl_dim_out);
2284 int *unroll = user;
2286 for (i = 0; i < n_out; ++i) {
2287 isl_constraint *c;
2288 int ok;
2290 ok = isl_basic_map_has_defining_equality(bmap,
2291 isl_dim_out, i, &c);
2292 assert(ok);
2293 for (j = 0; j < n_in; ++j)
2294 if (isl_constraint_involves_dims(c, isl_dim_in, j, 1))
2295 unroll[j] = 1;
2296 isl_constraint_free(c);
2299 isl_basic_map_free(bmap);
2300 return 0;
2303 /* Given an array "pos" mapping each input dimension to the corresponding
2304 * output dimension, construct the corresponding permutation map.
2306 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
2307 int *pos, int len)
2309 int i;
2310 isl_constraint *c;
2311 isl_basic_map *bmap;
2312 isl_local_space *ls;
2314 dim = isl_space_add_dims(dim, isl_dim_in, len);
2315 dim = isl_space_add_dims(dim, isl_dim_out, len);
2316 bmap = isl_basic_map_universe(isl_space_copy(dim));
2317 ls = isl_local_space_from_space(dim);
2319 for (i = 0; i < len; ++i) {
2320 c = isl_equality_alloc(isl_local_space_copy(ls));
2321 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
2322 isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i], 1);
2323 bmap = isl_basic_map_add_constraint(bmap, c);
2325 isl_local_space_free(ls);
2327 return isl_map_from_basic_map(bmap);
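/* As an illustration (not part of the original code): for len = 3 and
 * pos = { 1, 2, 0 }, the constraints out[pos[i]] = in[i] constructed above
 * yield the permutation map
 *
 *	{ [i0, i1, i2] -> [o0, o1, o2] : o1 = i0 and o2 = i1 and o0 = i2 }
 */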
2330 /* Find all loops involved in any of the index expressions for any of
2331 * the private accesses, move them innermost and then mark them as
2332 * requiring unrolling by setting gen->first_unroll.
2333 * The loops involved should all be parallel because of the checks
2334 * we performed in check_private_group_access. Moving them innermost
2335 * is therefore a valid transformation.
2337 static __isl_give isl_union_map *interchange_for_unroll(struct cuda_gen *gen,
2338 __isl_take isl_union_map *sched)
2340 int i, j;
2341 int unroll[gen->thread_tiled_len];
2342 int perm[gen->thread_tiled_len];
2343 isl_space *dim;
2344 isl_map *permute;
2345 int len = gen->shared_len + gen->n_parallel + gen->n_block;
2347 gen->first_unroll = -1;
2349 for (i = 0; i < gen->thread_tiled_len; ++i)
2350 unroll[i] = 0;
2351 for (i = 0; i < gen->n_array; ++i) {
2352 struct cuda_array_info *array = &gen->array[i];
2354 for (j = 0; j < array->n_group; ++j) {
2355 isl_union_map *access;
2356 isl_map *acc;
2358 if (!array->groups[j]->private_bound)
2359 continue;
2361 access = group_access_relation(array->groups[j], 1, 1);
2362 access = isl_union_map_apply_domain(access,
2363 isl_union_map_copy(sched));
2365 acc = isl_map_from_union_map(access);
2366 isl_map_foreach_basic_map(acc, &check_unroll, unroll);
2368 isl_map_free(acc);
2372 for (i = 0; i < gen->shared_len; ++i)
2373 if (unroll[i])
2374 return sched;
2376 for (i = gen->shared_len; i < len; ++i)
2377 if (unroll[i])
2378 break;
2380 if (i >= len)
2381 return sched;
2383 for (i = len; i < gen->thread_tiled_len; ++i)
2384 if (unroll[i])
2385 return sched;
2387 j = 0;
2388 for (i = 0; i < gen->thread_tiled_len; ++i)
2389 if (!unroll[i])
2390 perm[i] = j++;
2391 gen->first_unroll = 1 + j;
2392 for (i = 0; i < len; ++i)
2393 if (unroll[i])
2394 perm[i] = j++;
2396 dim = isl_union_map_get_space(sched);
2397 permute = permutation(dim, perm, gen->thread_tiled_len);
2398 sched = isl_union_map_apply_range(sched,
2399 isl_union_map_from_map(permute));
2401 return sched;
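/* For instance (a hypothetical case, not derived from any particular input):
 * with thread_tiled_len = len = 5 and unroll = { 0, 0, 1, 0, 1 }, the
 * permutation constructed above maps the non-unrolled dimensions 0, 1 and 3
 * to positions 0, 1 and 2 and the unrolled dimensions 2 and 4 to
 * positions 3 and 4, i.e., the loops that need to be unrolled are moved
 * innermost.
 */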
2404 /* This function is called for each leaf in the clast of the kernel code.
2405 * We first specialize the schedule to the site of the leaf and
2406 * print code for reading into shared memory, performing the actual
2407 * computations and writing from shared memory, with the required
2408 * synchronizations.
2410 static void print_kernel_user(struct clast_printer_info *code,
2411 struct clast_user_stmt *u)
2413 struct cuda_gen *gen = code->user;
2414 isl_set *shared_domain;
2416 shared_domain = extract_entire_host_domain(&u->stmt);
2418 print_shared_accesses(gen, shared_domain, gen->read, "read", -1);
2420 print_private_accesses(gen, shared_domain, gen->read, "read", -1);
2422 print_shared_body(gen, shared_domain, gen->local_sched,
2423 gen->thread_tiled_len, &print_statement,
2424 gen->first_unroll);
2426 print_private_accesses(gen, shared_domain, gen->write, "write", -1);
2428 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
2429 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
2431 print_shared_accesses(gen, shared_domain, gen->write, "write", -1);
2433 isl_set_free(shared_domain);
2436 /* Check if we need to perform any copying to shared memory at this level
2437 * and if so, print the copying instructions.
2438 * Any array for which we are allowed to print copying instructions at
2439 * this level, but haven't done so already, is printed.
2441 static void copy_to_local(struct cuda_gen *gen, __isl_keep isl_set *domain)
2443 int i, j;
2444 int level;
2445 int print = 0;
2447 level = isl_set_dim(domain, isl_dim_set);
2449 for (i = 0; i < gen->n_array; ++i) {
2450 struct cuda_array_info *array = &gen->array[i];
2452 for (j = 0; j < array->n_group; ++j) {
2453 if (array->groups[j]->print_shared_level >= 0)
2454 continue;
2455 if (array->groups[j]->last_shared >= level)
2456 continue;
2457 array->groups[j]->print_shared_level = level;
2458 print = 1;
2462 if (print) {
2463 print_shared_accesses(gen, domain, gen->read, "read", level);
2464 print_private_accesses(gen, domain, gen->read, "read", level);
2469 /* This function is called for each for loop in the clast,
2470 * right after the opening brace has been printed.
2472 * Print copying instructions to shared or private memory if needed.
2474 static void print_kernel_for_head(struct clast_printer_info *code,
2475 struct clast_for *f)
2477 struct cuda_gen *gen = code->user;
2478 isl_set *domain;
2480 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2481 copy_to_local(gen, domain);
2483 isl_set_free(domain);
2486 /* Print instructions for copying from shared memory for each array
2487 * for which print_kernel_for_head has added copying instructions
2488 * to shared memory.
2490 static void copy_from_local(struct cuda_gen *gen, __isl_keep isl_set *domain)
2492 int i, j;
2493 int level;
2494 int print = 0;
2496 level = isl_set_dim(domain, isl_dim_set);
2498 for (i = 0; i < gen->n_array; ++i) {
2499 struct cuda_array_info *array = &gen->array[i];
2501 for (j = 0; j < array->n_group; ++j) {
2502 if (array->groups[j]->print_shared_level != level)
2503 continue;
2504 print = 1;
2505 break;
2507 if (print)
2508 break;
2511 if (print) {
2512 print_private_accesses(gen, domain, gen->write, "write", level);
2513 print_shared_accesses(gen, domain, gen->write, "write", level);
2517 /* This function is called for each for loop in the clast,
2518 * right before the closing brace is printed.
2520 * Print copying instructions from shared or private memory if needed.
2522 static void print_kernel_for_foot(struct clast_printer_info *code,
2523 struct clast_for *f)
2525 struct cuda_gen *gen = code->user;
2526 isl_set *domain;
2528 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2529 copy_from_local(gen, domain);
2531 isl_set_free(domain);
2534 /* Use CLooG to generate code for the outer gen->shared_len loops
2535 * of the local schedule "sched".
2536 * The pretty printing of this code is handled by print_clast,
2537 * which calls print_kernel_user for each iteration of the shared tile loops.
2539 static void print_cloog_kernel_body(struct cuda_gen *gen,
2540 __isl_keep isl_set *context, __isl_keep isl_union_map *sched)
2542 int i;
2543 CloogOptions *options;
2544 CloogDomain *cloog_context;
2545 CloogUnionDomain *ud;
2546 CloogInput *input;
2547 struct clast_stmt *stmt;
2548 char name[20];
2550 sched = isl_union_map_copy(sched);
2551 sched = isl_union_map_align_params(sched, isl_set_get_space(context));
2553 options = cloog_options_malloc(gen->state);
2554 options->language = CLOOG_LANGUAGE_C;
2555 options->strides = 1;
2556 options->sh = 1;
2557 options->stop = gen->shared_len;
2558 options->f = gen->tiled_len;
2559 options->l = gen->tiled_len;
2560 options->save_domains = 1;
2561 options->noscalars = 1;
2563 ud = cloog_union_domain_from_isl_union_map(sched);
2564 for (i = 0; i < gen->shared_len; ++i) {
2565 snprintf(name, sizeof(name), "g%d", i);
2566 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
2568 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
2569 input = cloog_input_alloc(cloog_context, ud);
2571 stmt = cloog_clast_create_from_input(input, options);
2573 gen->kernel_code.indent = 4;
2574 gen->kernel_code.dst = gen->cuda.kernel_c;
2575 gen->kernel_code.print_user_stmt = NULL;
2576 gen->kernel_code.print_user_stmt_list = &print_kernel_user;
2577 gen->kernel_code.print_for_head = &print_kernel_for_head;
2578 gen->kernel_code.print_for_foot = &print_kernel_for_foot;
2579 gen->kernel_code.user = gen;
2580 copy_to_local(gen, context);
2581 print_clast(&gen->kernel_code, stmt);
2582 copy_from_local(gen, context);
2584 cloog_clast_free(stmt);
2585 cloog_options_free(options);
2588 static void print_kernel_iterators(struct cuda_gen *gen)
2590 int i;
2591 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
2592 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
2593 "threadIdx.z" };
2595 if (gen->n_grid > 0) {
2596 print_indent(gen->cuda.kernel_c, 4);
2597 fprintf(gen->cuda.kernel_c, "int ");
2598 for (i = 0; i < gen->n_grid; ++i) {
2599 if (i)
2600 fprintf(gen->cuda.kernel_c, ", ");
2601 fprintf(gen->cuda.kernel_c, "b%d = %s",
2602 i, block_dims[gen->n_grid - 1 - i]);
2604 fprintf(gen->cuda.kernel_c, ";\n");
2607 if (gen->n_block > 0) {
2608 print_indent(gen->cuda.kernel_c, 4);
2609 fprintf(gen->cuda.kernel_c, "int ");
2610 for (i = 0; i < gen->n_block; ++i) {
2611 if (i)
2612 fprintf(gen->cuda.kernel_c, ", ");
2613 fprintf(gen->cuda.kernel_c, "t%d = %s",
2614 i, thread_dims[gen->n_block - 1 - i]);
2616 fprintf(gen->cuda.kernel_c, ";\n");
2620 static void print_group_shared_array(struct cuda_gen *gen,
2621 struct cuda_array_ref_group *group)
2623 int j;
2624 struct cuda_array_bound *bounds;
2626 bounds = group->private_bound;
2627 if (!bounds)
2628 bounds = group->shared_bound;
2629 if (!bounds)
2630 return;
2632 print_indent(gen->cuda.kernel_c, 4);
2633 fprintf(gen->cuda.kernel_c, "%s%s ",
2634 group->private_bound ? "" : "__shared__ ", group->array->type);
2635 print_array_name(gen->cuda.kernel_c, group);
2636 for (j = 0; j < group->array->n_index; ++j) {
2637 fprintf(gen->cuda.kernel_c, "[");
2638 isl_int_print(gen->cuda.kernel_c, bounds[j].size, 0);
2639 fprintf(gen->cuda.kernel_c, "]");
2641 fprintf(gen->cuda.kernel_c, ";\n");
2644 static void print_shared_arrays(struct cuda_gen *gen)
2646 int i, j;
2648 for (i = 0; i < gen->n_array; ++i) {
2649 struct cuda_array_info *array = &gen->array[i];
2651 for (j = 0; j < array->n_group; ++j)
2652 print_group_shared_array(gen, array->groups[j]);
2656 static void print_kernel_body(struct cuda_gen *gen,
2657 __isl_keep isl_set *host_domain, __isl_keep isl_union_map *sched)
2659 isl_set *context;
2661 context = isl_set_copy(host_domain);
2662 context = parametrize(context, 0, gen->tile_first, "h");
2663 context = isl_set_project_out(context, isl_dim_set, 0, gen->tile_first);
2664 context = add_bounded_parameters(context,
2665 gen->n_grid, gen->grid_dim, "b");
2667 print_kernel_iterators(gen);
2668 print_shared_arrays(gen);
2670 fprintf(gen->cuda.kernel_c, "\n");
2672 print_cloog_kernel_body(gen, context, sched);
2674 isl_set_free(context);
2677 /* Given a constraint
2679 * a(p,i) + j = g f(e)
2681 * or -a(p,i) - j = g f(e) if sign < 0,
2682 * store a(p,i) in bound->shift and g (stride) in bound->stride.
2683 * a(p,i) is assumed to be an expression in only the parameters.
2685 static void extract_stride(__isl_keep isl_constraint *c,
2686 struct cuda_array_bound *bound, isl_int stride, int sign)
2688 int i;
2689 isl_int v;
2690 isl_space *dim;
2691 unsigned nparam;
2692 isl_aff *aff;
2694 isl_int_set(bound->stride, stride);
2696 dim = isl_constraint_get_space(c);
2697 dim = isl_space_params(dim);
2699 nparam = isl_space_dim(dim, isl_dim_param);
2701 isl_int_init(v);
2703 isl_constraint_get_constant(c, &v);
2704 if (sign < 0)
2705 isl_int_neg(v, v);
2706 aff = isl_aff_zero_on_domain(isl_local_space_from_space(dim));
2707 aff = isl_aff_set_constant(aff, v);
2709 for (i = 0; i < nparam; ++i) {
2710 isl_constraint_get_coefficient(c, isl_dim_param, i, &v);
2711 if (isl_int_is_zero(v))
2712 continue;
2713 if (sign < 0)
2714 isl_int_neg(v, v);
2715 aff = isl_aff_add_coefficient(aff, isl_dim_param, i, v);
2718 isl_int_clear(v);
2720 bound->shift = aff;
2723 /* Given an equality constraint of a map with a single output dimension j,
2724 * check if the constraint is of the form
2726 * a(p,i) + j = g f(e)
2728 * with a(p,i) an expression in the parameters and input dimensions
2729 * and f(e) an expression in the existentially quantified variables.
2730 * If so, and if g is larger than any such g from a previously considered
2731 * constraint, then call extract_stride to record the stride information
2732 * in bound.
2734 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
2736 int i;
2737 isl_int v, stride;
2738 unsigned n_div;
2739 struct cuda_array_bound *bound = user;
2741 isl_int_init(v);
2742 isl_int_init(stride);
2744 n_div = isl_constraint_dim(c, isl_dim_div);
2745 isl_constraint_get_coefficient(c, isl_dim_out, 0, &v);
2747 if (n_div && (isl_int_is_one(v) || isl_int_is_negone(v))) {
2748 int s = isl_int_sgn(v);
2749 isl_int_set_si(stride, 0);
2750 for (i = 0; i < n_div; ++i) {
2751 isl_constraint_get_coefficient(c, isl_dim_div, i, &v);
2752 isl_int_gcd(stride, stride, v);
2754 if (!isl_int_is_zero(stride) &&
2755 isl_int_gt(stride, bound->stride))
2756 extract_stride(c, bound, stride, s);
2759 isl_int_clear(stride);
2760 isl_int_clear(v);
2762 isl_constraint_free(c);
2763 return 0;
2766 /* Given constraints on an array index i, check if we can find
2767 * a shift a(p) and a stride g such that
2769 * a(p) + i = 0 mod g
2771 * If so, record the information in bound and apply the mapping
2772 * i -> (i + a(p))/g to the array index in bounds and return
2773 * the new constraints.
2774 * If not, simply return the original constraints.
2776 static __isl_give isl_basic_map *check_stride(struct cuda_gen *gen,
2777 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2779 isl_basic_map *aff;
2780 isl_basic_map *shift;
2781 isl_aff *aff_shift;
2783 isl_int_set_si(bound->stride, -1);
2785 aff = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
2787 isl_basic_map_foreach_constraint(aff, &check_stride_constraint, bound);
2789 isl_basic_map_free(aff);
2791 if (isl_int_is_neg(bound->stride))
2792 return bounds;
2794 aff_shift = isl_aff_copy(bound->shift);
2795 aff_shift = isl_aff_add_dims(aff_shift, isl_dim_in, 1);
2796 aff_shift = isl_aff_add_coefficient_si(aff_shift, isl_dim_in, 0, 1);
2797 aff_shift = isl_aff_scale_down(aff_shift, bound->stride);
2798 shift = isl_basic_map_from_aff(aff_shift);
2800 bound->shift_map = isl_basic_map_copy(shift);
2801 bounds = isl_basic_map_apply_range(bounds, shift);
2803 return bounds;
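/* A small example (purely illustrative): if the affine hull of "bounds"
 * contains an equality expressing that i - p is a multiple of 4 for some
 * parameter p, then bound->stride is set to 4, bound->shift to -p and
 * the mapping i -> (i - p)/4 is applied to the array index before the
 * tile size is computed.
 */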
2806 struct cuda_size_info {
2807 isl_basic_set *bset;
2808 struct cuda_array_bound *bound;
2809 int pos;
2812 /* Given a constraint from the basic set describing the bounds on
2813 * an array index, check if it is a lower bound, say m i >= b(x), and,
2814 * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
2815 * upper bound. If so, and if this bound is smaller than any bound
2816 * derived from earlier constraints, set the size to this bound on
2817 * the expression and the lower bound to ceil(b(x)/m).
2819 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
2821 struct cuda_size_info *size = user;
2822 unsigned nparam;
2823 unsigned n_div;
2824 isl_int v;
2826 nparam = isl_basic_set_dim(size->bset, isl_dim_param);
2827 n_div = isl_constraint_dim(c, isl_dim_div);
2829 if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div)) {
2830 isl_constraint_free(c);
2831 return 0;
2834 isl_int_init(v);
2836 isl_constraint_get_coefficient(c, isl_dim_set, size->pos, &v);
2838 if (isl_int_is_pos(v)) {
2839 isl_aff *aff;
2840 isl_aff *lb;
2841 enum isl_lp_result res;
2843 aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
2844 aff = isl_aff_ceil(aff);
2846 lb = isl_aff_copy(aff);
2848 aff = isl_aff_neg(aff);
2849 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
2851 res = isl_basic_set_max(size->bset, aff, &v);
2852 isl_aff_free(aff);
2854 if (res == isl_lp_ok) {
2855 isl_int_add_ui(v, v, 1);
2856 if (isl_int_is_neg(size->bound->size) ||
2857 isl_int_lt(v, size->bound->size)) {
2858 isl_int_set(size->bound->size, v);
2859 lb = isl_aff_drop_dims(lb, isl_dim_in,
2860 0, size->pos + 1);
2861 isl_aff_free(size->bound->lb);
2862 size->bound->lb = isl_aff_copy(lb);
2865 isl_aff_free(lb);
2868 isl_int_clear(v);
2869 isl_constraint_free(c);
2871 return 0;
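/* As an example (hypothetical, for illustration only): if the accesses to
 * the index under consideration satisfy
 *
 *	b(x) <= i <= b(x) + 7
 *
 * for some expression b(x) not involving i, then the lower bound i >= b(x)
 * yields lb = b(x), the expression i - b(x) + 1 has constant upper bound 8
 * and the size in this direction is therefore at most 8.
 */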
2874 /* Given a basic map "bounds" that maps parameters and input dimensions
2875 * to a single output dimension, look for an expression in the parameters
2876 * and input dimensions such that the range of the output dimension shifted
2877 * by this expression is a constant.
2879 * In particular, we currently only consider lower bounds on the output
2880 * dimension as candidate expressions.
2882 static int compute_array_dim_size(struct cuda_gen *gen,
2883 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2885 struct cuda_size_info size;
2887 bounds = isl_basic_map_detect_equalities(bounds);
2888 bounds = check_stride(gen, bound, bounds);
2890 isl_int_set_si(bound->size, -1);
2891 bound->lb = NULL;
2893 size.bound = bound;
2894 size.pos = isl_basic_map_dim(bounds, isl_dim_in);
2895 size.bset = isl_basic_map_wrap(bounds);
2896 size.bset = isl_basic_set_flatten(size.bset);
2897 size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
2898 isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
2899 &size);
2900 isl_basic_set_free(size.bset);
2902 return isl_int_is_nonneg(bound->size) ? 0 : -1;
2905 /* Check if we can find a shared memory tile for the given array
2906 * based on the given accesses, and if so, put the results
2907 * in array->shared_bound.
2909 * We project the accesses on each index in turn and look for a parametric
2910 * offset such that the size is constant.
2912 static int can_tile_for_shared_memory(struct cuda_gen *gen,
2913 struct cuda_array_info *array, __isl_keep isl_map *access,
2914 struct cuda_array_bound *bounds)
2916 int i;
2918 for (i = 0; i < array->n_index; ++i) {
2919 isl_map *access_i;
2920 isl_basic_map *hull;
2922 access_i = isl_map_copy(access);
2923 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
2924 access_i = isl_map_project_out(access_i, isl_dim_out,
2925 1, array->n_index - (i + 1));
2926 access_i = isl_map_compute_divs(access_i);
2927 hull = isl_map_simple_hull(access_i);
2928 if (compute_array_dim_size(gen, &bounds[i], hull) < 0)
2929 return 0;
2932 return 1;
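/* For example (purely illustrative): if the accesses in a group are of the
 * form A[t + i][j] with 0 <= i < 32 and 0 <= j < 32 for a tile loop
 * iterator t, then the two projections yield offsets t and 0 with constant
 * sizes 32 and 32, so a 32x32 tile can be used for this group.
 */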
2935 /* Construct a map whose input consists of the shared tile loops and
2936 * the loops that will be wrapped around the threads; the map relates
2937 * these latter loops to the thread indices and then projects them out.
2939 static __isl_give isl_map *compute_privatization(struct cuda_gen *gen)
2941 isl_map *priv;
2942 isl_map *tiling;
2943 isl_map *proj;
2944 isl_set *par;
2945 isl_space *dim;
2947 dim = isl_union_map_get_space(gen->shared_sched);
2949 if (gen->options->wrap)
2950 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
2951 gen->shared_len, gen->n_block, gen->block_dim);
2952 else
2953 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
2954 gen->shared_len, gen->n_block, gen->block_dim);
2956 priv = tiling;
2958 par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2959 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2960 gen->n_block, "t");
2962 priv = isl_map_align_params(priv, isl_set_get_space(par));
2963 priv = isl_map_intersect_range(priv, par);
2965 dim = isl_map_get_space(priv);
2966 dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2967 dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2968 proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2969 gen->shared_len);
2971 priv = isl_map_apply_range(priv, proj);
2973 return priv;
2976 /* Construct a map from domain_dim to domain_dim that increments
2977 * the dimension at position "pos" and leaves all other dimensions
2978 * constant.
2980 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2982 int i;
2983 int len = isl_space_dim(domain_dim, isl_dim_set);
2984 isl_space *dim;
2985 isl_basic_map *next;
2986 isl_local_space *ls;
2988 dim = isl_space_map_from_set(domain_dim);
2989 next = isl_basic_map_universe(isl_space_copy(dim));
2990 ls = isl_local_space_from_space(dim);
2992 for (i = 0; i < len; ++i) {
2993 isl_constraint *c;
2995 c = isl_equality_alloc(isl_local_space_copy(ls));
2996 isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2997 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2998 if (i == pos)
2999 isl_constraint_set_constant_si(c, 1);
3000 next = isl_basic_map_add_constraint(next, c);
3003 isl_local_space_free(ls);
3005 return isl_map_from_basic_map(next);
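/* For example (illustration only): on a 3-dimensional domain, next(dim, 1)
 * constructs the map
 *
 *	{ [i0, i1, i2] -> [i0, i1 + 1, i2] }
 */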
3008 /* Check if the given access is coalesced.
3009 * That is, check whether incrementing the dimension that will get
3010 * wrapped over the last thread index results in incrementing
3011 * the last array index.
3013 * This function is only called for access relations without reuse.
3015 static int access_is_coalesced(struct cuda_gen *gen,
3016 __isl_keep isl_union_map *access)
3018 isl_space *dim;
3019 isl_map *access_map;
3020 isl_map *next_thread_x;
3021 isl_map *next_element;
3022 isl_map *map;
3023 int coalesced;
3025 access = isl_union_map_copy(access);
3026 access = isl_union_map_apply_domain(access,
3027 isl_union_map_copy(gen->tiled_sched));
3028 access_map = isl_map_from_union_map(access);
3030 dim = isl_map_get_space(access_map);
3031 dim = isl_space_domain(dim);
3032 next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
3034 dim = isl_map_get_space(access_map);
3035 dim = isl_space_range(dim);
3036 next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
3038 map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
3039 map = isl_map_apply_range(map, access_map);
3041 coalesced = isl_map_is_subset(map, next_element);
3043 isl_map_free(next_element);
3044 isl_map_free(map);
3046 return coalesced;
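/* Illustration (not derived from the code above): for an access A[i][t],
 * where t is the dimension that gets wrapped over the last thread index,
 * moving to the next value of t moves to the next element in the innermost
 * array dimension, so the access is considered coalesced; an access A[t][i]
 * would not be.
 */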
3049 /* For the given array reference group, check whether the access is private
3050 * to the thread. That is, check that any given array element
3051 * is only accessed by a single thread.
3052 * We compute an access relation that maps the shared tile loop iterators
3053 * and the shared point loop iterators that will be wrapped over the
3054 * threads to the array elements.
3055 * We actually check that those iterators that will be wrapped
3056 * partition the array space. This check is stricter than necessary
3057 * since several iterations may be mapped onto the same thread
3058 * and then they could be allowed to access the same memory elements,
3059 * but our check does not allow this situation.
3061 * We also check that the index expression only depends on parallel
3062 * loops. That way, we can move those loops innermost and unroll them.
3063 * Again, we use a test that is stricter than necessary.
3064 * We actually check whether the index expression only depends
3065 * on the iterators that are wrapped over the threads.
3066 * These are necessarily parallel, but there may be more parallel loops.
3068 * Combining the injectivity of the first test with the single-valuedness
3069 * of the second test, we simply test for bijectivity.
3071 * If it turns out we can use registers, we compute the private memory
3072 * tile size using can_tile_for_shared_memory, after introducing a dependence
3073 * on the thread indices.
3075 * Before performing any of the above computations, we first check
3076 * if there is any reuse on the reference group. If not, we simply
3077 * return. If, moreover, the access is coalesced then we also remove
3078 * the shared memory tiling since we should just use global memory instead.
3080 static void check_private_group_access(struct cuda_gen *gen,
3081 struct cuda_array_ref_group *group)
3083 isl_map *acc;
3084 isl_union_map *access;
3085 int n_index = group->array->n_index;
3087 access = group_access_relation(group, 1, 1);
3088 if (isl_union_map_is_injective(access)) {
3089 if (group->shared_bound && access_is_coalesced(gen, access)) {
3090 free_bound_list(group->shared_bound, n_index);
3091 group->shared_bound = NULL;
3093 isl_union_map_free(access);
3094 return;
3096 access = isl_union_map_apply_domain(access,
3097 isl_union_map_copy(gen->shared_sched));
3099 acc = isl_map_from_union_map(access);
3101 if (!isl_map_is_bijective(acc)) {
3102 isl_map_free(acc);
3103 return;
3106 group->private_bound = create_bound_list(gen->ctx, n_index);
3107 acc = isl_map_align_params(acc, isl_map_get_space(gen->privatization));
3108 acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
3109 if (!can_tile_for_shared_memory(gen, group->array, acc,
3110 group->private_bound)) {
3111 free_bound_list(group->private_bound, n_index);
3112 group->private_bound = NULL;
3115 isl_map_free(acc);
3118 /* Look for the last shared tile loop that affects the offset of the
3119 * shared or private tile and store the result in group->last_shared.
3121 static void set_last_shared(struct cuda_gen *gen,
3122 struct cuda_array_ref_group *group)
3124 int i, j;
3125 struct cuda_array_bound *bounds;
3126 unsigned first_shared = gen->first_shared;
3127 int n_index = group->array->n_index;
3129 bounds = group->private_bound;
3130 if (!bounds)
3131 bounds = group->shared_bound;
3132 if (!bounds)
3133 return;
3135 for (j = gen->shared_len - 1; j >= 0; --j) {
3136 for (i = 0; i < n_index; ++i) {
3137 isl_aff *lb;
3138 isl_aff *shift;
3140 lb = bounds[i].lb;
3141 if (isl_aff_involves_dims(lb, isl_dim_param,
3142 first_shared + j, 1))
3143 break;
3145 shift = bounds[i].shift;
3146 if (!shift)
3147 continue;
3148 if (isl_aff_involves_dims(shift, isl_dim_param,
3149 first_shared + j, 1))
3150 break;
3152 if (i < n_index)
3153 break;
3155 group->last_shared = j;
3158 /* Compute the sizes of all private arrays for the current kernel,
3159 * as well as the offsets of the private pieces in the original arrays.
3160 * If we cannot or don't want to privatize a given array group,
3161 * we use the shared memory tile sizes computed in
3162 * compute_group_shared_bound instead.
3164 * If we have been able to find a private or shared tile,
3165 * we also look for the last shared tile loop that affects the offset
3166 * (and therefore the group tile) and store the result in group->last_shared.
3168 * A privatized copy of all access relations from reference groups that
3169 * are mapped to private memory is stored in gen->private_access.
3171 static void compute_private_size(struct cuda_gen *gen)
3173 int i, j;
3174 isl_union_map *private;
3176 if (!gen->options->use_private_memory)
3177 return;
3179 private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
3181 for (i = 0; i < gen->n_array; ++i) {
3182 struct cuda_array_info *array = &gen->array[i];
3184 for (j = 0; j < array->n_group; ++j) {
3185 check_private_group_access(gen, array->groups[j]);
3187 if (!array->groups[j]->private_bound)
3188 continue;
3190 private = isl_union_map_union(private,
3191 group_access_relation(array->groups[j], 1, 1));
3194 for (j = 0; j < array->n_group; ++j) {
3195 array->groups[j]->last_shared = gen->shared_len - 1;
3196 array->groups[j]->print_shared_level = -1;
3197 set_last_shared(gen, array->groups[j]);
3201 if (isl_union_map_is_empty(private))
3202 isl_union_map_free(private);
3203 else {
3204 isl_union_map *priv;
3206 private = isl_union_map_apply_domain(private,
3207 isl_union_map_copy(gen->shared_sched));
3208 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
3209 private = isl_union_map_apply_domain(private, priv);
3210 gen->private_access = private;
3214 /* Compute the size of the tile specified by the list "bound" of n_index
3215 * cuda_array_bounds in number of elements and put the result in *size.
3217 static void tile_size(unsigned n_index, struct cuda_array_bound *bound,
3218 isl_int *size)
3220 int i;
3222 isl_int_set_si(*size, 1);
3224 for (i = 0; i < n_index; ++i)
3225 isl_int_mul(*size, *size, bound[i].size);
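/* For instance (hypothetical sizes): a tile with two indices of sizes
 * 32 and 33 occupies 32 * 33 = 1056 elements; callers such as
 * check_shared_memory_bound multiply this count by the element size.
 */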
3228 /* If max_shared_memory is not set to infinity (-1), then make
3229 * sure that the total amount of shared memory required by the
3230 * array reference groups mapped to shared memory is no larger
3231 * than this maximum.
3233 * We apply a greedy approach and discard (keep in global memory)
3234 * those groups that would result in a total memory size that
3235 * is larger than the maximum.
3237 static void check_shared_memory_bound(struct cuda_gen *gen)
3239 int i, j;
3240 isl_int left, size;
3242 if (gen->options->max_shared_memory < 0)
3243 return;
3245 isl_int_init(left);
3246 isl_int_init(size);
3247 isl_int_set_si(left, gen->options->max_shared_memory);
3249 for (i = 0; i < gen->n_array; ++i) {
3250 struct cuda_array_info *array = &gen->array[i];
3252 for (j = 0; j < array->n_group; ++j) {
3253 struct cuda_array_ref_group *group;
3255 group = array->groups[j];
3256 if (!group->shared_bound)
3257 continue;
3259 tile_size(array->n_index, group->shared_bound, &size);
3260 isl_int_mul_ui(size, size, array->size);
3262 if (isl_int_le(size, left)) {
3263 isl_int_sub(left, left, size);
3264 continue;
3267 free_bound_list(group->shared_bound, array->n_index);
3268 group->shared_bound = NULL;
3272 isl_int_clear(size);
3273 isl_int_clear(left);
3276 /* Fill up the groups array with singleton groups, i.e., one group
3277 * per reference, initializing the array, access, write and refs fields.
3278 * In particular the access field is initialized to the scheduled
3279 * access relation of the array reference.
3281 * Return the number of elements initialized, i.e., the number of
3282 * active references in the current kernel.
3284 static int populate_array_references(struct cuda_gen *gen,
3285 struct cuda_array_info *array, __isl_keep isl_union_map *sched,
3286 struct cuda_array_ref_group **groups)
3288 int i;
3289 int n;
3290 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3292 n = 0;
3293 for (i = 0; i < array->n_ref; ++i) {
3294 isl_union_map *umap;
3295 isl_map *map;
3296 struct cuda_array_ref_group *group;
3297 struct cuda_stmt_access *access = array->refs[i];
3299 map = isl_map_copy(access->access);
3300 umap = isl_union_map_from_map(map);
3301 umap = isl_union_map_apply_domain(umap,
3302 isl_union_map_copy(sched));
3304 if (isl_union_map_is_empty(umap)) {
3305 isl_union_map_free(umap);
3306 continue;
3309 map = isl_map_from_union_map(umap);
3310 map = isl_map_detect_equalities(map);
3312 group = isl_calloc_type(ctx, struct cuda_array_ref_group);
3313 assert(group);
3314 group->array = array;
3315 group->access = map;
3316 group->write = access->write;
3317 group->refs = &array->refs[i];
3319 groups[n++] = group;
3322 return n;
3325 static void free_array_ref_group(struct cuda_array_ref_group *group,
3326 int n_index)
3328 if (!group)
3329 return;
3330 free_bound_list(group->shared_bound, n_index);
3331 free_bound_list(group->private_bound, n_index);
3332 isl_map_free(group->access);
3333 free(group->refs);
3334 free(group);
3337 /* Given a set where the parameters gen->first_shared up to
3338 * gen->first_shared + gen->shared_len represent the tile loops,
3339 * eliminate the innermost of those that have a fixed value
3340 * until we reach one that does not (obviously) have a fixed value.
3342 static __isl_give isl_set *eliminate_fixed_inner_loops(struct cuda_gen *gen,
3343 __isl_take isl_set *access)
3345 int i;
3347 for (i = gen->shared_len - 1; i >= 0; --i) {
3348 int pos = gen->first_shared + i;
3349 if (!isl_set_plain_is_fixed(access, isl_dim_param, pos, NULL))
3350 break;
3351 access = isl_set_eliminate(access, isl_dim_param, pos, 1);
3353 return access;
3356 /* Check whether the accessed sets of group1 and group2 overlap within
3357 * the innermost loop. In particular, ignore any inner dimension
3358 * with a fixed value.
3359 * The copying to and from shared memory will be performed within
3360 * the innermost actual loop so we are only allowed to consider
3361 * the dimensions up to that innermost loop while checking whether
3362 * two access sets overlap.
3364 static int accesses_overlap(struct cuda_gen *gen,
3365 struct cuda_array_ref_group *group1,
3366 struct cuda_array_ref_group *group2)
3368 int empty;
3369 isl_set *access1, *access2;
3371 access1 = isl_map_range(isl_map_copy(group1->access));
3372 access1 = eliminate_fixed_inner_loops(gen, access1);
3373 access2 = isl_map_range(isl_map_copy(group2->access));
3374 access2 = eliminate_fixed_inner_loops(gen, access2);
3375 access1 = isl_set_intersect(access1, access2);
3376 empty = isl_set_is_empty(access1);
3377 isl_set_free(access1);
3379 return !empty;
3382 /* If two groups have overlapping access relations (within the innermost
3383 * loop) and if one of them involves a write, then merge the two groups
3384 * into one.
3386 * We keep track of the grouping in "leader". leader[j] points to
3387 * an earlier group array element that belongs to the same group,
3388 * or the array element j itself if this element is the first in the group.
3390 * Return the number of group leaders.
3392 static int group_overlapping_writes(struct cuda_gen *gen, int n,
3393 struct cuda_array_ref_group **groups, int *leader)
3395 int i, j;
3396 int n_group = n;
3398 for (i = 0; i < n; ++i) {
3399 int l = i;
3400 groups[l]->n_ref = 1;
3401 for (j = i - 1; j >= 0; --j) {
3402 if (leader[j] != j)
3403 continue;
3404 if (!groups[l]->write && !groups[j]->write)
3405 continue;
3407 if (!accesses_overlap(gen, groups[l], groups[j]))
3408 continue;
3410 groups[j]->access = isl_map_union(groups[j]->access,
3411 groups[l]->access);
3412 groups[j]->write = 1;
3413 groups[l]->access = NULL;
3414 groups[j]->n_ref += groups[l]->n_ref;
3415 l = leader[l] = j;
3416 n_group--;
3418 leader[i] = l;
3421 return n_group;
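/* Example of the leader encoding (hypothetical): leader = { 0, 0, 2, 2, 4 }
 * means that references 1 and 3 have been merged into the groups led by
 * references 0 and 2 respectively, while 0, 2 and 4 remain group leaders,
 * so the function would return n_group = 3.
 */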
3424 /* Compute the size of the shared array corresponding to the given
3425 * array reference group, based on the accesses from the current kernel,
3426 * as well as the offset of the shared piece in the original array.
3428 static void compute_group_shared_bound(struct cuda_gen *gen,
3429 struct cuda_array_info *array, struct cuda_array_ref_group *group)
3431 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3433 if (!gen->options->use_shared_memory)
3434 return;
3435 if (cuda_array_is_read_only_scalar(array))
3436 return;
3438 group->shared_bound = create_bound_list(ctx, array->n_index);
3439 if (!can_tile_for_shared_memory(gen, array, group->access,
3440 group->shared_bound)) {
3441 free_bound_list(group->shared_bound, array->n_index);
3442 group->shared_bound = NULL;
3446 /* Is the size of the tile specified by "bound" smaller than the sum of
3447 * the sizes of the tiles specified by "bound1" and "bound2"?
3449 static int smaller_tile(unsigned n_index, struct cuda_array_bound *bound,
3450 struct cuda_array_bound *bound1, struct cuda_array_bound *bound2)
3452 int smaller;
3453 isl_int size, size1, size2;
3455 isl_int_init(size);
3456 isl_int_init(size1);
3457 isl_int_init(size2);
3459 tile_size(n_index, bound, &size);
3460 tile_size(n_index, bound1, &size1);
3461 tile_size(n_index, bound2, &size2);
3463 isl_int_sub(size, size, size1);
3464 isl_int_sub(size, size, size2);
3465 smaller = isl_int_is_neg(size);
3467 isl_int_clear(size2);
3468 isl_int_clear(size1);
3469 isl_int_clear(size);
3471 return smaller;
3474 /* Given an initial grouping of array references and shared memory tiles
3475 * for each group that allows for a shared memory tile, merge two groups
3476 * if both have a shared memory tile, the merged group also has
3477 * a shared memory tile and the size of the tile for the merged group
3478 * is smaller than the sum of the tile sizes of the individual groups.
3480 * Return the number of group leaders after merging.
3482 static int group_common_shared_memory_tile(struct cuda_gen *gen,
3483 struct cuda_array_info *array, int n,
3484 struct cuda_array_ref_group **groups, int *leader, int n_group)
3486 int i, j;
3487 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3489 for (i = 0; n_group > 1 && i < n; ++i) {
3490 int l = i;
3491 if (leader[i] != i)
3492 continue;
3493 if (!groups[i]->shared_bound)
3494 continue;
3495 for (j = i - 1; j >= 0; --j) {
3496 isl_map *map;
3497 int empty;
3498 struct cuda_array_bound *shared_bound;
3500 if (leader[j] != j)
3501 continue;
3502 if (!groups[j]->shared_bound)
3503 continue;
3505 map = isl_map_intersect(isl_map_copy(groups[l]->access),
3506 isl_map_copy(groups[j]->access));
3507 empty = isl_map_is_empty(map);
3508 isl_map_free(map);
3510 if (empty)
3511 continue;
3513 map = isl_map_union(isl_map_copy(groups[l]->access),
3514 isl_map_copy(groups[j]->access));
3515 shared_bound = create_bound_list(ctx, array->n_index);
3516 if (!can_tile_for_shared_memory(gen, array, map,
3517 shared_bound) ||
3518 !smaller_tile(array->n_index, shared_bound,
3519 groups[l]->shared_bound,
3520 groups[j]->shared_bound)) {
3521 isl_map_free(map);
3522 free_bound_list(shared_bound, array->n_index);
3523 continue;
3526 free_bound_list(groups[j]->shared_bound,
3527 array->n_index);
3528 groups[j]->shared_bound = shared_bound;
3529 isl_map_free(groups[j]->access);
3530 groups[j]->access = map;
3531 groups[j]->n_ref += groups[l]->n_ref;
3532 l = leader[l] = j;
3533 n_group--;
3537 return n_group;
3540 /* Extract an array of array reference groups from the array of references
3541 * and the grouping information in "leader".
3543 * Store the results in array->n_group and array->groups.
3545 static void extract_array_groups(isl_ctx *ctx, struct cuda_array_info *array,
3546 int n, struct cuda_array_ref_group **groups, int *leader, int n_group)
3548 int i, j;
3550 for (i = 2; i < n; ++i)
3551 leader[i] = leader[leader[i]];
3553 array->n_group = n_group;
3554 array->groups = isl_alloc_array(ctx, struct cuda_array_ref_group *,
3555 n_group);
3556 assert(array->groups);
3558 j = 0;
3559 for (i = 0; i < n; ++i) {
3560 int k, l;
3561 struct cuda_stmt_access **refs;
3563 if (leader[i] != i) {
3564 groups[i]->refs = NULL;
3565 free_array_ref_group(groups[i], array->n_index);
3566 continue;
3569 refs = isl_alloc_array(ctx, struct cuda_stmt_access *,
3570 groups[i]->n_ref);
3571 assert(refs);
3572 l = 0;
3573 for (k = i; k < n; ++k)
3574 if (leader[k] == i) {
3575 refs[l++] = *groups[k]->refs;
3576 (*groups[k]->refs)->group = j;
3579 groups[i]->refs = refs;
3580 groups[i]->nr = j;
3581 array->groups[j++] = groups[i];
3585 /* Group array references that should be considered together when
3586 * deciding whether to access them from private, shared or global memory.
3588 * In particular, if two array references overlap and if one of them
3589 * is a write, then the two references are grouped together.
3590 * Furthermore, if two groups admit a shared memory tile and if the
3591 * combination of the two also admits a shared memory tile, we merge
3592 * the two groups.
3594 * During the construction the group->refs field points to a single
3595 * array reference inside the array of array references, while
3596 * group->n_ref contains the number of elements in leader that
3597 * (directly or indirectly) point to this group, provided the group
3598 * is a leader.
3600 static void group_array_references(struct cuda_gen *gen,
3601 struct cuda_array_info *array, __isl_keep isl_union_map *sched)
3603 int i;
3604 int n, n_group;
3605 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3606 struct cuda_array_ref_group **groups;
3607 int *leader;
3609 groups = isl_calloc_array(ctx, struct cuda_array_ref_group *,
3610 array->n_ref);
3611 assert(groups);
3613 n = populate_array_references(gen, array, sched, groups);
3615 leader = isl_alloc_array(ctx, int, n);
3616 assert(leader);
3618 n_group = group_overlapping_writes(gen, n, groups, leader);
3620 for (i = 0; i < n; ++i)
3621 if (leader[i] == i)
3622 compute_group_shared_bound(gen, array, groups[i]);
3624 n_group = group_common_shared_memory_tile(gen, array, n, groups,
3625 leader, n_group);
3627 extract_array_groups(ctx, array, n, groups, leader, n_group);
3629 free(leader);
3630 free(groups);
3633 /* Take tiled_sched, project it onto the shared tile loops and
3634 * the loops that will be wrapped over the threads,
3635 * parametrize the shared tile loops and store the result in gen->shared_sched.
3636 * The position of the first of these parameters is stored in gen->first_shared.
3637 * Also compute a projection that projects out the loops that will be
3638 * wrapped over the threads and store this projection in gen->shared_proj.
3640 static void compute_shared_sched(struct cuda_gen *gen)
3642 isl_space *dim;
3643 isl_map *proj;
3644 isl_set *par;
3645 isl_union_map *sched;
3647 sched = isl_union_map_copy(gen->tiled_sched);
3649 dim = isl_union_map_get_space(sched);
3650 gen->first_shared = isl_space_dim(dim, isl_dim_param);
3651 proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
3652 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3654 dim = isl_union_map_get_space(sched);
3655 par = parametrization(dim, gen->shared_len + gen->n_block,
3656 0, gen->shared_len, "g");
3657 sched = isl_union_map_intersect_range(sched,
3658 isl_union_set_from_set(par));
3660 dim = isl_union_map_get_space(sched);
3661 proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
3663 gen->shared_sched = sched;
3664 gen->shared_proj = isl_union_map_from_map(proj);
3667 /* Group references of all arrays in the program.
3669 static void group_references(struct cuda_gen *gen)
3671 int i;
3672 isl_union_map *sched;
3674 sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3675 isl_union_map_copy(gen->shared_proj));
3677 for (i = 0; i < gen->n_array; ++i)
3678 group_array_references(gen, &gen->array[i], sched);
3680 isl_union_map_free(sched);
3683 /* Free all array information that is local to the current kernel.
3685 static void free_local_array_info(struct cuda_gen *gen)
3687 int i, j;
3689 for (i = 0; i < gen->n_array; ++i) {
3690 struct cuda_array_info *array = &gen->array[i];
3692 for (j = 0; j < array->n_group; ++j)
3693 free_array_ref_group(array->groups[j], array->n_index);
3694 free(array->groups);
3696 if (array->n_group == 0)
3697 continue;
3698 for (j = 0; j < gen->array[i].n_index; ++j) {
3699 isl_pw_aff_free(gen->array[i].local_bound[j]);
3700 gen->array[i].local_bound[j] = NULL;
3705 /* The sizes of the arrays on the host that have been computed by
3706 * extract_array_info may depend on the parameters. Use the extra
3707 * constraints on the parameters that are valid at "host_domain"
3708 * to simplify these expressions.
3710 static void localize_bounds(struct cuda_gen *gen,
3711 __isl_keep isl_set *host_domain)
3713 int i, j;
3714 isl_set *context;
3716 context = isl_set_copy(host_domain);
3717 context = isl_set_params(context);
3719 for (i = 0; i < gen->n_array; ++i) {
3720 struct cuda_array_info *array = &gen->array[i];
3722 if (array->n_group == 0)
3723 continue;
3725 for (j = 0; j < array->n_index; ++j) {
3726 isl_pw_aff *pwaff;
3728 pwaff = isl_pw_aff_copy(array->bound[j]);
3729 pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3730 array->local_bound[j] = pwaff;
3733 isl_set_free(context);
3736 /* Set gen->tile_len and gen->n_parallel to those of the first statement
3737 * in the statement list u.
3738 * Because of the way the schedule is constructed, the other statements
3739 * in the list, if any, should have the same values for these properties.
3741 static void set_tile_len(struct cuda_gen *gen, struct clast_user_stmt *u)
3743 int nr;
3744 struct cuda_stmt *stmt;
3746 nr = atoi(u->statement->name + 2);
3747 stmt = &gen->stmts[nr];
3749 gen->tile_len = stmt->tile_len;
3750 gen->n_parallel = stmt->n_parallel;
/* Extract a description of the grid, i.e., the possible values
 * of the block ids, from gen->tiled_sched.
 * The block ids are parameters in gen->tiled_sched.
 * We simply need to change them into set dimensions.
 */
static __isl_give isl_set *extract_grid(struct cuda_gen *gen)
{
	int i;
	isl_set *grid;

	grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
	grid = isl_set_from_params(grid);
	grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
	for (i = 0; i < gen->n_grid; ++i) {
		int pos;
		char name[20];

		snprintf(name, sizeof(name), "b%d", i);
		pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
		assert(pos >= 0);
		grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
		grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
	}

	return grid;
}

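/* Editor's note (illustration, not part of ppcg): with n_grid = 1 and a tiled
 * schedule whose parameter constraints include 0 <= b0 <= 15, the steps above
 * turn the parameter set
 *
 *	[b0] -> { : 0 <= b0 <= 15 }
 *
 * into the one-dimensional grid set
 *
 *	{ [i0] : 0 <= i0 <= 15 }
 *
 * by equating a fresh set dimension to the parameter "b0" and then
 * projecting the parameter out.
 */
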
/* Print the effective grid size as a list of the sizes in each
 * dimension, from innermost to outermost.
 *
 * The grid size specified by the user or set by default
 * in read_grid_sizes() and applied in tile_schedule()
 * may be too large for the given code in the sense that
 * it may contain blocks that don't need to execute anything.
 * We therefore don't print this grid size, but instead the
 * smallest grid size that ensures that all blocks that actually
 * execute code are included in the grid.
 *
 * For each block dimension, we compute the maximal value of the block id
 * and add one.
 */
static void print_grid_size(struct cuda_gen *gen, __isl_take isl_set *context)
{
	int i;
	isl_printer *prn;
	isl_set *grid;

	if (gen->n_grid == 0) {
		isl_set_free(context);
		return;
	}

	grid = extract_grid(gen);

	prn = isl_printer_to_file(gen->ctx, gen->cuda.host_c);
	prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);

	prn = isl_printer_print_str(prn, "(");
	for (i = gen->n_grid - 1; i >= 0; --i) {
		isl_space *space;
		isl_aff *one;
		isl_pw_aff *bound = isl_set_dim_max(isl_set_copy(grid), i);

		bound = isl_pw_aff_coalesce(bound);
		bound = isl_pw_aff_gist(bound, isl_set_copy(context));

		space = isl_pw_aff_get_domain_space(bound);
		one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
		one = isl_aff_add_constant_si(one, 1);
		bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
		prn = isl_printer_print_pw_aff(prn, bound);
		isl_pw_aff_free(bound);

		if (i > 0)
			prn = isl_printer_print_str(prn, ", ");
	}
	prn = isl_printer_print_str(prn, ")");

	isl_printer_free(prn);
	isl_set_free(grid);
	isl_set_free(context);
}

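/* Editor's note (illustration, not part of ppcg): if the grid has a single
 * block dimension and the blocks that actually execute code satisfy
 *
 *	{ [b0] : 0 <= b0 <= floor((n - 1)/16) }
 *
 * then the maximal block id is floor((n - 1)/16) and the loop above prints
 * an effective grid size equivalent to "floord(n - 1, 16) + 1" rather than
 * the (possibly larger) user-specified grid size.
 */
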
/* This function is called for each leaf in the clast of the host code.
 * We first specialize the schedule to the site of the leaf, compute
 * the size of shared memory and then print the body of host code
 * and the associated kernel (through a call to print_kernel_body).
 */
static void print_host_user(struct clast_printer_info *code,
	struct clast_user_stmt *u)
{
	struct cuda_gen *gen = code->user;
	isl_space *dim;
	isl_set *par;
	isl_set *host_domain;
	isl_union_map *access;
	isl_union_map *local_sched;
	isl_union_set *arrays;

	set_tile_len(gen, u);
	read_sizes(gen);

	host_domain = extract_entire_host_domain(&u->stmt);

	local_sched = isl_union_map_intersect_range(
		isl_union_map_copy(gen->sched),
		isl_union_set_from_set(extend(isl_set_copy(host_domain),
					      gen->untiled_len)));
	access = isl_union_map_union(isl_union_map_copy(gen->read),
				     isl_union_map_copy(gen->write));
	access = isl_union_map_apply_domain(access,
					    isl_union_map_copy(local_sched));
	arrays = isl_union_map_range(access);

	print_indent(code->dst, code->indent);
	fprintf(code->dst, "dim3 k%d_dimBlock", gen->kernel_id);
	print_reverse_list(code->dst, gen->n_block, gen->block_dim);
	fprintf(code->dst, ";\n");

	gen->tiled_sched = tile_schedule(gen, local_sched);
	gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
	gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);

	print_indent(code->dst, code->indent);
	fprintf(code->dst, "dim3 k%d_dimGrid", gen->kernel_id);
	print_grid_size(gen, isl_set_params(isl_set_copy(host_domain)));
	fprintf(code->dst, ";\n");

	gen->local_sched = isl_union_map_copy(gen->tiled_sched);

	dim = isl_union_map_get_space(gen->local_sched);
	par = parametrization(dim, gen->tiled_len, 0, gen->shared_len, "g");
	gen->local_sched = isl_union_map_intersect_range(gen->local_sched,
		isl_union_set_from_set(par));

	gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
	gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);

	gen->private_access = NULL;
	compute_shared_sched(gen);
	gen->privatization = compute_privatization(gen);
	group_references(gen);
	compute_private_size(gen);
	check_shared_memory_bound(gen);
	localize_bounds(gen, host_domain);

	gen->local_sched = interchange_for_unroll(gen, gen->local_sched);

	print_kernel_launch(gen, arrays);

	fprintf(gen->cuda.kernel_c, "{\n");

	print_kernel_body(gen, host_domain, gen->tiled_sched);

	fprintf(gen->cuda.kernel_c, "}\n");

	free_local_array_info(gen);
	isl_map_free(gen->privatization);
	isl_union_map_free(gen->private_access);
	isl_union_map_free(gen->local_sched);
	isl_union_map_free(gen->tiled_sched);
	isl_union_map_free(gen->shared_sched);
	isl_union_map_free(gen->shared_proj);
	isl_union_set_free(arrays);
	isl_set_free(host_domain);

	free(gen->tile_size);
	gen->kernel_id++;
}

/* Use CLooG to generate code for the outer gen->tile_first loops
 * of the global schedule in gen->sched.
 * The pretty printing of this code is handled by print_clast,
 * which calls print_host_user for each kernel invocation location.
 */
static void print_cloog_host_code(struct cuda_gen *gen)
{
	int i;
	isl_set *context;
	isl_union_map *sched;
	CloogOptions *options;
	CloogDomain *cloog_context;
	CloogUnionDomain *ud;
	CloogInput *input;
	struct clast_stmt *stmt;
	char name[20];

	options = cloog_options_malloc(gen->state);
	options->language = CLOOG_LANGUAGE_C;
	options->otl = 0;
	options->strides = 1;
	options->stop = gen->tile_first;
	options->f = gen->untiled_len;
	options->l = gen->untiled_len;
	options->save_domains = 1;
	options->noscalars = 1;

	sched = isl_union_map_copy(gen->sched);
	ud = cloog_union_domain_from_isl_union_map(sched);
	for (i = 0; i < options->stop; ++i) {
		snprintf(name, sizeof(name), "h%d", i);
		ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
	}
	context = isl_set_copy(gen->context);
	cloog_context = cloog_domain_from_isl_set(context);
	input = cloog_input_alloc(cloog_context, ud);

	stmt = cloog_clast_create_from_input(input, options);

	gen->code.indent = 0;
	gen->code.dst = gen->cuda.host_c;
	gen->code.print_user_stmt = NULL;
	gen->code.print_user_stmt_list = &print_host_user;
	gen->code.print_for_head = NULL;
	gen->code.print_for_foot = NULL;
	gen->code.user = gen;
	print_clast(&gen->code, stmt);

	cloog_clast_free(stmt);
	cloog_options_free(options);
	fprintf(gen->cuda.host_c, "\n");
}

void print_cuda_macros(struct cuda_gen *gen)
{
	const char *macros =
		"#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
		"#define cudaCheckKernel()"
		" assert(cudaGetLastError() == cudaSuccess)\n\n";
	fputs(macros, gen->cuda.host_c);
}

void print_host_code(struct cuda_gen *gen)
{
	fprintf(gen->cuda.host_c, "{\n");
	print_cloog_macros(gen->cuda.host_c);
	print_cloog_macros(gen->cuda.kernel_c);

	print_cuda_macros(gen);

	declare_device_arrays(gen);

	allocate_device_arrays(gen);
	copy_arrays_to_device(gen);

	gen->kernel_id = 0;
	print_cloog_host_code(gen);

	copy_arrays_from_device(gen);
	free_device_arrays(gen);

	fprintf(gen->cuda.host_c, "}\n");
}

__isl_give isl_set *add_context_from_str(__isl_take isl_set *set,
	const char *str)
{
	isl_ctx *ctx;
	isl_set *context;

	if (!str)
		return set;

	ctx = isl_set_get_ctx(set);
	context = isl_set_read_from_str(ctx, str);
	context = isl_set_align_params(context, isl_set_get_space(set));
	set = isl_set_intersect(set, context);

	return set;
}

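/* Editor's illustration (not part of ppcg): how add_context_from_str() can be
 * used to restrict a parameter context with extra, user-supplied constraints.
 * The parameter name and the context string are hypothetical, and this helper
 * is never called by ppcg itself.
 */
static void add_context_from_str_sketch(isl_ctx *ctx)
{
	isl_set *set;

	set = isl_set_read_from_str(ctx, "[n] -> { : n >= 0 }");
	/* After this call, the context additionally satisfies n >= 1024. */
	set = add_context_from_str(set, "[n] -> { : n >= 1024 }");
	isl_set_free(set);
}
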
__isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
{
	if (!str)
		return NULL;
	return isl_union_map_read_from_str(ctx, str);
}

/* Return the union of all iteration domains of the gen->stmts[i].
 */
static __isl_give isl_union_set *extract_domain(struct cuda_gen *gen)
{
	int i;
	isl_union_set *domain;

	domain = isl_union_set_empty(isl_set_get_space(gen->context));
	for (i = 0; i < gen->n_stmts; ++i) {
		isl_set *domain_i;

		domain_i = isl_set_copy(gen->stmts[i].domain);
		domain = isl_union_set_union(domain,
					     isl_union_set_from_set(domain_i));
	}

	return domain;
}

/* Information about the outermost tilable bands in the forest of bands.
 *
 * tile_len and n_parallel are only set on band_info structures
 * that correspond to outermost bands.  For other bands (in particular,
 * ancestors of the outermost bands), n_parallel is set to 0.
 *
 * prefix is the (padded) schedule leading up to the outermost tilable bands.
 *
 * tile_first is the number of schedule dimensions in prefix.
 *
 * suffix is the schedule of the outermost tilable bands and their descendants.
 */
struct band_info {
	struct cuda_gen *gen;
	int tile_first;
	int tile_len;
	int n_parallel;
	isl_union_map *prefix;
	isl_union_map *suffix;
};

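/* Editor's note (illustration, not part of ppcg): for a schedule of the form
 *
 *	{ S[i,j] -> [i, j] }
 *
 * in which only the inner loop belongs to the outermost tilable band,
 * the corresponding band_info would have tile_first = 1 (one prefix
 * dimension), tile_len = 1, n_parallel = 1,
 * prefix = { S[i,j] -> [i] } and suffix = { S[i,j] -> [j] }.
 */
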
/* Set tile_len and n_parallel of the statement to those of
 * its outermost band, recorded in the band_info.
 */
static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
{
	struct band_info *info = user;
	int nr;
	struct cuda_stmt *stmt;

	nr = atoi(isl_map_get_tuple_name(map, isl_dim_in) + 2);
	stmt = &info->gen->stmts[nr];

	stmt->tile_len = info->tile_len;
	stmt->n_parallel = info->n_parallel;

	isl_map_free(map);

	return 0;
}

static void list_select_outer_band(struct cuda_gen *gen,
	__isl_take isl_band_list *list, int pos, struct band_info *list_info);

/* Check if this band has any parallel loops.  If so, take it as
 * the outermost tilable band.  If not, continue looking for the
 * outermost tilable band in the children of the current band.
 */
static void band_select_outer_band(struct cuda_gen *gen,
	__isl_take isl_band *band, int pos, struct band_info *info)
{
	int n = isl_band_n_member(band);
	int n_parallel;

	for (n_parallel = 0; n_parallel < n; ++n_parallel)
		if (!isl_band_member_is_zero_distance(band, n_parallel))
			break;

	info->n_parallel = n_parallel;
	if (n_parallel) {
		info->gen = gen;
		info->tile_first = pos;
		info->tile_len = n;
		info->prefix = isl_band_get_prefix_schedule(band);
		info->suffix = isl_union_map_flat_range_product(
				isl_band_get_partial_schedule(band),
				isl_band_get_suffix_schedule(band));
		isl_union_map_foreach_map(info->prefix,
						&set_stmt_tile_len, info);
	} else if (isl_band_has_children(band)) {
		isl_band_list *children;
		children = isl_band_get_children(band);
		list_select_outer_band(gen, children, pos + n, info);
	} else {
		info->gen = gen;
		info->tile_first = pos + n;
		info->tile_len = 0;
		info->prefix = isl_union_map_flat_range_product(
				isl_band_get_prefix_schedule(band),
				isl_band_get_partial_schedule(band));
		info->suffix = isl_band_get_suffix_schedule(band);
		isl_union_map_foreach_map(info->prefix,
						&set_stmt_tile_len, info);
	}

	isl_band_free(band);
}

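/* Editor's note (illustration, not part of ppcg): if the outermost band of a
 * statement has no zero-distance (parallel) member, but its child band has
 * two such members, then the child band is selected as the outermost tilable
 * band, with tile_first advanced past the members of the outer band and
 * tile_len = 2.
 */
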
/* Comparison function that returns a non-zero value for band_infos
 * with different tile_len fields or different n_parallel fields.
 */
static int cmp_band(const void *p1, const void *p2)
{
	const struct band_info *info1 = p1;
	const struct band_info *info2 = p2;

	if (info1->tile_len != info2->tile_len)
		return info1->tile_len - info2->tile_len;

	return info1->n_parallel - info2->n_parallel;
}

/* Extend "umap" with coordinates with fixed value "val"
 * to a total length of "dst_len", assuming the original dimension is "src_len".
 */
static __isl_give isl_union_map *extend_range(__isl_take isl_union_map *umap,
	int src_len, int dst_len, int val)
{
	isl_space *dim;
	isl_map *map;
	int i;

	dim = isl_union_map_get_space(umap);
	map = isl_map_reverse(projection(dim, dst_len, src_len));
	for (i = src_len; i < dst_len; ++i)
		map = isl_map_fix_si(map, isl_dim_out, i, val);

	umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));

	return umap;
}

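/* Editor's illustration (not part of ppcg): the effect of extend_range()
 * on a simple schedule.  The statement name "S" and this helper are
 * hypothetical; the function is never called by ppcg.
 */
static void extend_range_sketch(isl_ctx *ctx)
{
	isl_union_map *umap;

	umap = isl_union_map_read_from_str(ctx,
		"{ S[i] -> [i] : 0 <= i < 4 }");
	/* Pad the one-dimensional range to length 3 with the fixed value 7,
	 * yielding { S[i] -> [i, 7, 7] : 0 <= i < 4 }. */
	umap = extend_range(umap, 1, 3, 7);
	isl_union_map_free(umap);
}
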
/* Group bands with the same values for tile_len and n_parallel.
 * The prefix schedule is then extended with a fixed coordinate that
 * is different for each such group.
 * Note that the actual values for this coordinate are not important.
 * The bands have already been effectively separated at a higher level
 * or they are independent and may be executed in parallel.
 * The list of band_info has been sorted before this function is called.
 */
static void separate_bands(struct band_info *info, int n)
{
	int i;
	int j = 0;

	for (i = 0; i < n; ++i) {
		int l = info[i].tile_first;

		if (i &&
		    (info[i].tile_len != info[i - 1].tile_len ||
		     info[i].n_parallel != info[i - 1].n_parallel))
			j++;

		info[i].prefix = extend_range(info[i].prefix,
						l, l + 1, j);
		info[i].tile_first = l + 1;
	}
}

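/* Editor's note (illustration, not part of ppcg): given two sorted band_infos
 * with aligned prefixes { S[i] -> [i] } and { T[i] -> [i] } but different
 * tile_len values, separate_bands() extends the prefixes to
 * { S[i] -> [i, 0] } and { T[i] -> [i, 1] }, so that the two groups can be
 * told apart by the extra schedule dimension.
 */
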
/* Select the outermost bands in the elements of the list, align
 * their prefix schedules, separate bands with different values
 * for tile_len and/or n_parallel and then combine the resulting
 * prefix and suffix schedules into a single pair of prefix and
 * suffix schedules for the entire list.
 */
static void list_select_outer_band(struct cuda_gen *gen,
	__isl_take isl_band_list *list, int pos, struct band_info *list_info)
{
	isl_band *band;
	int i;
	int n = isl_band_list_n_band(list);
	isl_ctx *ctx = isl_band_list_get_ctx(list);
	struct band_info *info;
	int max_tile_first;
	isl_union_map *prefix;
	isl_union_map *suffix;

	assert(n >= 1);
	info = isl_calloc_array(ctx, struct band_info, n);
	assert(info);

	max_tile_first = 0;
	for (i = 0; i < n; ++i) {
		band = isl_band_list_get_band(list, i);
		band_select_outer_band(gen, band, pos, &info[i]);
		if (info[i].tile_first > max_tile_first)
			max_tile_first = info[i].tile_first;
	}

	for (i = 0; i < n; ++i) {
		if (info[i].tile_first == max_tile_first)
			continue;
		info[i].prefix = extend_range(info[i].prefix,
					info[i].tile_first, max_tile_first, 0);
		info[i].tile_first = max_tile_first;
	}

	qsort(info, n, sizeof(struct band_info), &cmp_band);

	for (i = 0; i < n - 1; ++i)
		if (info[i].tile_len != info[i + 1].tile_len ||
		    info[i].n_parallel != info[i + 1].n_parallel)
			break;

	if (i < n - 1)
		separate_bands(info, n);

	prefix = info[0].prefix;
	suffix = info[0].suffix;

	for (i = 1; i < n; ++i) {
		prefix = isl_union_map_union(prefix, info[i].prefix);
		suffix = isl_union_map_union(suffix, info[i].suffix);
	}

	list_info->tile_first = info[0].tile_first;
	list_info->tile_len = -1;
	list_info->prefix = prefix;
	list_info->suffix = suffix;

	isl_band_list_free(list);
	free(info);
}

/* Set max_out to the maximal number of output dimensions over
 * all maps.
 */
static int update_max_out(__isl_take isl_map *map, void *user)
{
	int *max_out = user;
	int n_out = isl_map_dim(map, isl_dim_out);

	if (n_out > *max_out)
		*max_out = n_out;

	isl_map_free(map);
	return 0;
}

struct align_range_data {
	int max_out;
	isl_union_map *res;
};

/* Extend the dimension of the range of the given map to data->max_out and
 * then add the result to data->res.
 */
static int map_align_range(__isl_take isl_map *map, void *user)
{
	struct align_range_data *data = user;
	int i;
	isl_space *dim;
	isl_map *proj;
	int n_out = isl_map_dim(map, isl_dim_out);

	dim = isl_union_map_get_space(data->res);
	proj = isl_map_reverse(projection(dim, data->max_out, n_out));
	for (i = n_out; i < data->max_out; ++i)
		proj = isl_map_fix_si(proj, isl_dim_out, i, 0);

	map = isl_map_apply_range(map, proj);

	data->res = isl_union_map_add_map(data->res, map);

	return 0;
}

/* Extend the ranges of the maps in the union map such that they all have
 * the same dimension.
 */
static __isl_give isl_union_map *align_range(__isl_take isl_union_map *umap)
{
	struct align_range_data data;

	data.max_out = 0;
	isl_union_map_foreach_map(umap, &update_max_out, &data.max_out);

	data.res = isl_union_map_empty(isl_union_map_get_space(umap));
	isl_union_map_foreach_map(umap, &map_align_range, &data);

	isl_union_map_free(umap);
	return data.res;
}

/* Select the outermost tilable band that (by construction)
 * has at least one parallel loop.
 * The starting position of the aligned band is stored in gen->tile_first.
 * The sizes and number of parallel loops may be different in different
 * parts of the band forest and are therefore stored in the cuda_stmts.
 *
 * Return the complete schedule, with the tilable bands aligned
 * at gen->tile_first and padded with zero, if needed.
 */
static __isl_give isl_union_map *select_outer_tilable_band(struct cuda_gen *gen,
	__isl_keep isl_schedule *schedule)
{
	isl_band_list *list;
	struct band_info info;

	gen->n_parallel = 0;
	gen->tile_len = -1;

	list = isl_schedule_get_band_forest(schedule);

	list_select_outer_band(gen, list, 0, &info);

	gen->tile_first = info.tile_first;
	info.suffix = align_range(info.suffix);

	return isl_union_map_flat_range_product(info.prefix, info.suffix);
}

/* Set gen->untiled_len to the number of scheduling dimensions
 * for the schedule of the first domain.
 * We assume here that this number is the same for all domains.
 */
static int set_untiled_len(__isl_take isl_map *map, void *user)
{
	unsigned *untiled_len = user;

	*untiled_len = isl_map_dim(map, isl_dim_out);

	isl_map_free(map);
	return -1;
}

/* Compute an appropriate schedule based on the accesses in
 * gen->read and gen->write.
 *
 * We first compute dependences and then use those to compute
 * a schedule that has a parallel loop in each tilable band.
 * Finally, we select the outermost tilable band.
 */
static void compute_schedule(struct cuda_gen *gen,
	__isl_take isl_union_map *sched)
{
	isl_union_set *domain;
	isl_union_map *empty;
	isl_union_map *dep_raw, *dep2, *dep3, *dep;
	isl_union_map *uninitialized;
	isl_schedule *schedule;

	empty = isl_union_map_empty(isl_union_map_get_space(sched));

	isl_union_map_compute_flow(isl_union_map_copy(gen->read),
				isl_union_map_copy(gen->write), empty,
				isl_union_map_copy(sched),
				&dep_raw, NULL, &uninitialized, NULL);
	isl_union_map_compute_flow(isl_union_map_copy(gen->write),
				isl_union_map_copy(gen->write),
				isl_union_map_copy(gen->read),
				isl_union_map_copy(sched),
				&dep2, &dep3, NULL, NULL);
	isl_union_map_free(sched);

	gen->copy_in = isl_union_map_range(uninitialized);

	dep = isl_union_map_union(dep2, dep3);
	dep = isl_union_map_union(dep, dep_raw);
	dep = isl_union_map_coalesce(dep);

	domain = extract_domain(gen);
	schedule = isl_union_set_compute_schedule(isl_union_set_copy(domain),
				isl_union_map_copy(dep), dep);

	sched = select_outer_tilable_band(gen, schedule);

	isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
	sched = isl_union_map_intersect_domain(sched, domain);
	gen->sched = sched;

	isl_schedule_free(schedule);
}

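/* Editor's illustration (not part of ppcg): a minimal, self-contained sketch
 * of the flow analysis pattern used in compute_schedule() above, applied to
 * a toy one-statement loop.  All sets, maps and names here are hypothetical
 * and only serve this example; the function is never called by ppcg.
 */
static void compute_flow_sketch(isl_ctx *ctx)
{
	isl_union_map *read, *write, *empty, *sched;
	isl_union_map *dep_raw, *uninitialized;

	/* S[i] reads A[i - 1] and writes A[i]. */
	read = isl_union_map_read_from_str(ctx,
		"{ S[i] -> A[i - 1] : 1 <= i < 100 }");
	write = isl_union_map_read_from_str(ctx,
		"{ S[i] -> A[i] : 1 <= i < 100 }");
	sched = isl_union_map_read_from_str(ctx, "{ S[i] -> [i] }");
	empty = isl_union_map_empty(isl_union_map_get_space(sched));

	/* Flow (read-after-write) dependences: essentially S[i] -> S[i + 1],
	 * mapping each write to the read of the next iteration. */
	isl_union_map_compute_flow(read, write, empty,
				isl_union_map_copy(sched),
				&dep_raw, NULL, &uninitialized, NULL);

	isl_union_map_free(dep_raw);
	isl_union_map_free(uninitialized);
	isl_union_map_free(sched);
}
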
static struct cuda_stmt_access **expr_extract_access(struct pet_expr *expr,
	struct cuda_stmt_access **next_access)
{
	struct cuda_stmt_access *access;
	isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);

	access = isl_alloc_type(ctx, struct cuda_stmt_access);
	assert(access);
	access->next = NULL;
	access->read = expr->acc.read;
	access->write = expr->acc.write;
	access->access = isl_map_copy(expr->acc.access);

	*next_access = access;
	next_access = &(*next_access)->next;
	return next_access;
}

static struct cuda_stmt_access **expr_extract_accesses(struct pet_expr *expr,
	struct cuda_stmt_access **next_access)
{
	int i;

	for (i = 0; i < expr->n_arg; ++i)
		next_access = expr_extract_accesses(expr->args[i],
							next_access);

	if (expr->type == pet_expr_access)
		next_access = expr_extract_access(expr, next_access);

	return next_access;
}

static void pet_stmt_extract_accesses(struct cuda_stmt *stmt)
{
	struct cuda_stmt_access **next_access = &stmt->accesses;

	stmt->accesses = NULL;
	expr_extract_accesses(stmt->body, next_access);
}

/* Return an array of cuda_stmt representing the statements in "scop".
 */
static struct cuda_stmt *extract_stmts(isl_ctx *ctx, struct pet_scop *scop,
	__isl_keep isl_set *context)
{
	int i;
	struct cuda_stmt *stmts;

	stmts = isl_calloc_array(ctx, struct cuda_stmt, scop->n_stmt);
	assert(stmts);

	for (i = 0; i < scop->n_stmt; ++i) {
		struct cuda_stmt *s = &stmts[i];

		s->domain = isl_set_copy(scop->stmts[i]->domain);
		s->domain = isl_set_intersect_params(s->domain,
				isl_set_copy(context));
		s->body = scop->stmts[i]->body;
		pet_stmt_extract_accesses(s);
	}

	return stmts;
}

/* Replace the scop in the "input" file by equivalent code
 * that uses the GPU.  "scop" is assumed to correspond to this scop.
 *
 * We first compute a schedule that respects the dependences
 * of the original program and select the outermost band
 * of tilable dimensions that has at least one parallel loop.
 * We then have three blocks of dimensions
 *
 *	H		B		G
 *
 * The tilable band "B" is first tiled according to "tile" sizes, resulting
 * in
 *
 *	H	T	P	G
 *
 * For each iteration of the T loop and for each array, we compute
 * the array elements accessed by that iteration, construct a rectangular
 * box around it and shift it to the origin.  The result is used
 * as shared memory for the array.
 *
 * We then split off at most 2 parallel loops from the T loops and
 * at most 3 parallel loops from the P loops
 *
 *	H	T1	T2	P1	P2	G
 *
 * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
 * according to "grid"/"block" sizes.
 *
 *	H	T1T T1P	T2	P1T P1P	P2	G
 *
 * Finally, the T1P and P1P iterators are equated to the block and
 * thread dimensions respectively and so are effectively removed.
 * The H loops are run on the host.  The T1T, T2, P1T, P2 and G loops
 * are run on the GPU.
 *
 * Code is generated in three stages.  We first generate code for the
 * host (the H loops), with iterators h%d.  Then, for each leaf node
 * of the resulting AST, we generate code for the shared loops (up to
 * and including T2), with iterators g%d and after equating the H loops
 * to h%d parameters and the T1P loops to the block dimensions.
 * Finally, we generate code for the remaining loops in a similar fashion.
 *
 * A small worked example of this decomposition is given after this function.
 */
int generate_cuda(isl_ctx *ctx, struct pet_scop *scop,
	struct ppcg_options *options, const char *input)
{
	isl_union_map *sched;
	struct cuda_gen gen;

	if (!scop)
		return -1;

	scop = pet_scop_align_params(scop);

	gen.ctx = ctx;
	gen.context = isl_set_copy(scop->context);
	gen.context = add_context_from_str(gen.context, options->ctx);
	gen.sizes = extract_sizes_from_str(ctx, options->sizes);
	gen.n_stmts = scop->n_stmt;
	gen.stmts = extract_stmts(ctx, scop, gen.context);
	gen.read = pet_scop_collect_reads(scop);
	gen.write = pet_scop_collect_writes(scop);
	gen.options = options;
	gen.state = cloog_isl_state_malloc(gen.ctx);
	gen.scop = scop;

	cuda_open_files(&gen.cuda, input);

	collect_array_info(&gen);

	sched = pet_scop_collect_schedule(scop);

	compute_schedule(&gen, sched);

	print_host_code(&gen);

	cloog_state_free(gen.state);
	clear_cuda_gen(&gen);

	cuda_close_files(&gen.cuda);

	return 0;
}

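/* Editor's note (illustrative, not part of ppcg): a concrete instance of the
 * decomposition described above generate_cuda().  For a scop consisting of
 * a single statement
 *
 *	for (h = 0; h < M; ++h)
 *	  for (i = 0; i < N; ++i)
 *	    for (j = 0; j < N; ++j)
 *	      S(h, i, j);
 *
 * in which the h loop carries dependences and the i and j loops are parallel,
 * the h loop forms the H block and the {i, j} band forms B.  Tiling B with
 * 16x16 tiles yields T = {ti, tj} and P = {i', j'}; T1 = {ti, tj} is then
 * wrapped over the blocks (block ids b0, b1) and P1 = {i', j'} over the
 * threads (thread ids t0, t1), so that each thread block executes one or
 * more 16x16 tiles while the h loop remains on the host.
 */
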