[ppcg.git] / cuda.c
1 /*
2 * Copyright 2010-2011 INRIA Saclay
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
7 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
8 * 91893 Orsay, France
9 */
11 #include <assert.h>
12 #include <stdlib.h>
14 #include <isl/polynomial.h>
15 #include <isl/union_set.h>
16 #include <isl/aff.h>
17 #include <isl/ilp.h>
18 #include <isl/flow.h>
19 #include <isl/band.h>
20 #include <isl/schedule.h>
21 #include <isl/options.h>
22 #include <cloog/isl/cloog.h>
24 #include "cuda.h"
25 #include "cuda_common.h"
26 #include "clast_printer.h"
27 #include "schedule.h"
28 #include "pet_printer.h"
29 #include "ppcg_options.h"
31 /* The fields stride, shift and shift_map only contain valid information
32 * if shift != NULL.
33 * If so, they express that the current index is such that if you add shift,
34 * then the result is always a multiple of stride.
35 * shift_map contains the mapping
37 * i -> (i + shift)/stride
39 struct cuda_array_bound {
40 isl_int size;
41 isl_aff *lb;
43 isl_int stride;
44 isl_aff *shift;
45 isl_basic_map *shift_map;
48 struct cuda_array_info;
50 /* A group of array references in a kernel that should be handled together.
51 * If private_bound is not NULL, then it is mapped to registers.
52 * Otherwise, if shared_bound is not NULL, it is mapped to shared memory.
53 * Otherwise, it is accessed from global memory.
55 struct cuda_array_ref_group {
56 /* The references in this group access this array. */
57 struct cuda_array_info *array;
58 /* Position of this group in the list of reference groups of array. */
59 int nr;
61 /* The following fields are used during the construction of the groups.
62 * access is the combined access relation relative to the shared
63 * memory tiling.
64 * write is set if any access in the group is a write.
66 isl_map *access;
67 int write;
69 /* For each index, size and offset of piece in shared memory. */
70 struct cuda_array_bound *shared_bound;
72 /* For each index, size and offset of piece in private memory. */
73 struct cuda_array_bound *private_bound;
75 /* References in this group; point to elements of a linked list. */
76 int n_ref;
77 struct cuda_stmt_access **refs;
79 /* Last shared memory tile dimension that affects tile of this group. */
80 int last_shared;
81 /* Dimension at which copying to/from shared memory is printed.
82 * if >= 0, then the value is >= last_shared
83 * if -1, then the copying is done at the leaf level.
85 int print_shared_level;
88 struct cuda_array_info {
89 isl_space *dim;
90 /* Element type. */
91 char *type;
92 /* Element size. */
93 int size;
94 /* Name of the array. */
95 char *name;
96 /* Number of indices. */
97 unsigned n_index;
98 /* For each index, a bound on the array in that direction. */
99 isl_pw_aff **bound;
100 /* For each index, bound[i] specialized to the current kernel. */
101 isl_pw_aff **local_bound;
103 /* All references to this array; point to elements of a linked list. */
104 int n_ref;
105 struct cuda_stmt_access **refs;
107 /* The reference groups associated with this array. */
108 int n_group;
109 struct cuda_array_ref_group **groups;
111 /* For scalars, is this scalar read-only within the entire program? */
112 int read_only;
115 /* Print the name of the local copy of a given group of array references.
117 static void print_array_name(FILE *out, struct cuda_array_ref_group *group)
119 int global = 0;
121 if (group->private_bound)
122 fprintf(out, "private_");
123 else if (group->shared_bound)
124 fprintf(out, "shared_");
125 else
126 global = 1;
127 fprintf(out, "%s", group->array->name);
128 if (!global && group->array->n_group > 1)
129 fprintf(out, "_%d", group->nr);
132 /* Collect all references to the given array and store pointers to them
133 * in array->refs.
135 static void collect_references(struct cuda_gen *gen,
136 struct cuda_array_info *array)
138 int i;
139 int n;
141 n = 0;
142 for (i = 0; i < gen->n_stmts; ++i) {
143 struct cuda_stmt *stmt = &gen->stmts[i];
144 struct cuda_stmt_access *access;
146 for (access = stmt->accesses; access; access = access->next) {
147 const char *name;
148 name = isl_map_get_tuple_name(access->access,
149 isl_dim_out);
150 if (name && !strcmp(array->name, name))
151 n++;
155 array->n_ref = n;
156 array->refs = isl_alloc_array(gen->ctx, struct cuda_stmt_access *, n);
157 assert(array->refs);
159 n = 0;
160 for (i = 0; i < gen->n_stmts; ++i) {
161 struct cuda_stmt *stmt = &gen->stmts[i];
162 struct cuda_stmt_access *access;
164 for (access = stmt->accesses; access; access = access->next) {
165 const char *name;
166 name = isl_map_get_tuple_name(access->access,
167 isl_dim_out);
168 if (!name || strcmp(array->name, name))
169 continue;
171 array->refs[n++] = access;
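/* Allocate an array of "n_index" cuda_array_bound elements,
 * initializing the size and stride isl_int fields and clearing
 * the lb, shift and shift_map fields.
 */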
176 static struct cuda_array_bound *create_bound_list(isl_ctx *ctx, int n_index)
178 int i;
179 struct cuda_array_bound *bound;
181 bound = isl_alloc_array(ctx, struct cuda_array_bound, n_index);
182 assert(bound);
184 for (i = 0; i < n_index; ++i) {
185 isl_int_init(bound[i].size);
186 bound[i].lb = NULL;
187 isl_int_init(bound[i].stride);
188 bound[i].shift = NULL;
189 bound[i].shift_map = NULL;
192 return bound;
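/* Free an array of "n_index" cuda_array_bound elements
 * allocated by create_bound_list.
 */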
195 static void free_bound_list(struct cuda_array_bound *bound, int n_index)
197 int j;
199 if (!bound)
200 return;
202 for (j = 0; j < n_index; ++j) {
203 isl_int_clear(bound[j].size);
204 isl_int_clear(bound[j].stride);
205 isl_aff_free(bound[j].lb);
206 isl_aff_free(bound[j].shift);
207 isl_basic_map_free(bound[j].shift_map);
209 free(bound);
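/* Return the pet_array in "scop" whose extent has the same tuple
 * identifier as "accessed", or NULL if there is no such array.
 */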
212 static struct pet_array *find_array(struct pet_scop *scop,
213 __isl_keep isl_set *accessed)
215 int i;
216 isl_id *id;
218 id = isl_set_get_tuple_id(accessed);
220 for (i = 0; i < scop->n_array; ++i) {
221 isl_id *id_i;
223 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
224 isl_id_free(id_i);
225 if (id == id_i)
226 break;
228 isl_id_free(id);
230 return i < scop->n_array ? scop->arrays[i] : NULL;
233 /* Compute bounds on the host arrays based on the accessed elements
234 * and collect all references to the array.
236 * If the array is zero-dimensional, i.e., a scalar, we check
237 * whether it is read-only.
239 static int extract_array_info(__isl_take isl_set *array, void *user)
241 int i;
242 struct cuda_gen *gen = (struct cuda_gen *)user;
243 const char *name;
244 int n_index;
245 isl_pw_aff **bounds;
246 isl_pw_aff **local_bounds;
247 struct pet_array *pa;
249 n_index = isl_set_dim(array, isl_dim_set);
250 name = isl_set_get_tuple_name(array);
251 bounds = isl_alloc_array(isl_set_get_ctx(array),
252 isl_pw_aff *, n_index);
253 assert(bounds);
254 local_bounds = isl_calloc_array(isl_set_get_ctx(array),
255 isl_pw_aff *, n_index);
256 assert(local_bounds);
257 gen->array[gen->n_array].dim = isl_set_get_space(array);
258 gen->array[gen->n_array].name = strdup(name);
259 gen->array[gen->n_array].n_index = n_index;
260 gen->array[gen->n_array].bound = bounds;
261 gen->array[gen->n_array].local_bound = local_bounds;
263 pa = find_array(gen->scop, array);
264 assert(pa);
266 gen->array[gen->n_array].type = strdup(pa->element_type);
267 gen->array[gen->n_array].size = pa->element_size;
269 if (n_index == 0) {
270 isl_set *space;
271 isl_union_map *write;
272 int empty;
274 write = isl_union_map_copy(gen->write);
275 space = isl_set_universe(isl_set_get_space(array));
276 write = isl_union_map_intersect_range(write,
277 isl_union_set_from_set(space));
278 empty = isl_union_map_is_empty(write);
279 isl_union_map_free(write);
281 gen->array[gen->n_array].read_only = empty;
284 for (i = 0; i < n_index; ++i) {
285 isl_set *dom;
286 isl_local_space *ls;
287 isl_aff *one;
288 isl_pw_aff *bound;
289 isl_set *size = i == 0 ? array : pa->extent;
291 bound = isl_set_dim_max(isl_set_copy(size), i);
292 assert(bound);
293 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
294 ls = isl_local_space_from_space(isl_set_get_space(dom));
295 one = isl_aff_zero_on_domain(ls);
296 one = isl_aff_add_constant_si(one, 1);
297 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
298 bound = isl_pw_aff_gist(bound, isl_set_copy(gen->context));
300 bounds[i] = bound;
303 collect_references(gen, &gen->array[gen->n_array]);
305 gen->n_array++;
307 isl_set_free(array);
308 return 0;
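/* Collect all arrays that are read or written anywhere in the scop
 * and fill in a cuda_array_info structure for each of them
 * (see extract_array_info).
 */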
311 void collect_array_info(struct cuda_gen *gen)
313 isl_union_set *arrays;
315 arrays = isl_union_map_range(isl_union_map_copy(gen->read));
316 arrays = isl_union_set_union(arrays,
317 isl_union_map_range(isl_union_map_copy(gen->write)));
318 arrays = isl_union_set_coalesce(arrays);
320 gen->n_array = isl_union_set_n_set(arrays);
321 gen->array = isl_alloc_array(gen->ctx,
322 struct cuda_array_info, gen->n_array);
323 assert(gen->array);
324 gen->n_array = 0;
325 isl_union_set_foreach_set(arrays, &extract_array_info, gen);
326 isl_union_set_free(arrays);
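/* Free the cuda_array_info structures collected by collect_array_info. */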
329 static void free_array_info(struct cuda_gen *gen)
331 int i, j;
333 for (i = 0; i < gen->n_array; ++i) {
334 int n_index = gen->array[i].n_index;
335 free(gen->array[i].type);
336 free(gen->array[i].name);
337 for (j = 0; j < n_index; ++j) {
338 isl_pw_aff_free(gen->array[i].bound[j]);
339 isl_pw_aff_free(gen->array[i].local_bound[j]);
341 isl_space_free(gen->array[i].dim);
342 free(gen->array[i].bound);
343 free(gen->array[i].local_bound);
344 free(gen->array[i].refs);
346 free(gen->array);
349 /* Check if a cuda array is a scalar. A scalar is a value that is not stored
350 * as an array or through a pointer reference, but as a single data element. At
351 * the moment, scalars are represented as zero-dimensional arrays.
353 static int cuda_array_is_scalar(struct cuda_array_info *array)
355 return (array->n_index == 0);
358 /* Is "array" a read-only scalar?
360 static int cuda_array_is_read_only_scalar(struct cuda_array_info *array)
362 return cuda_array_is_scalar(array) && array->read_only;
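/* For each array that is not a read-only scalar, declare
 * a device pointer "dev_<name>" in the host code.
 */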
365 static void declare_device_arrays(struct cuda_gen *gen)
367 int i;
369 for (i = 0; i < gen->n_array; ++i) {
370 if (cuda_array_is_read_only_scalar(&gen->array[i]))
371 continue;
372 fprintf(gen->cuda.host_c, "%s *dev_%s;\n",
373 gen->array[i].type, gen->array[i].name);
375 fprintf(gen->cuda.host_c, "\n");
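/* Print an expression for the size in bytes of the given array,
 * i.e., the product of the extents in each dimension multiplied
 * by the size of a single element.
 */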
378 static void print_array_size(struct cuda_gen *gen, FILE *out,
379 struct cuda_array_info *array)
381 int i;
382 isl_printer *prn;
384 prn = isl_printer_to_file(gen->ctx, out);
385 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
386 for (i = 0; i < array->n_index; ++i) {
387 prn = isl_printer_print_str(prn, "(");
388 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
389 prn = isl_printer_print_str(prn, ") * ");
391 prn = isl_printer_print_str(prn, "sizeof(");
392 prn = isl_printer_print_str(prn, array->type);
393 prn = isl_printer_print_str(prn, ")");
394 isl_printer_free(prn);
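/* Print a cudaMalloc call for each array that is not a read-only scalar,
 * allocating the corresponding device array "dev_<name>".
 */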
397 static void allocate_device_arrays(struct cuda_gen *gen)
399 int i;
401 for (i = 0; i < gen->n_array; ++i) {
402 if (cuda_array_is_read_only_scalar(&gen->array[i]))
403 continue;
404 fprintf(gen->cuda.host_c,
405 "cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
406 gen->array[i].name);
407 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
408 fprintf(gen->cuda.host_c, "));\n");
410 fprintf(gen->cuda.host_c, "\n");
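/* Print cudaFree calls for the device arrays allocated in
 * allocate_device_arrays.
 */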
413 static void free_device_arrays(struct cuda_gen *gen)
415 int i;
417 for (i = 0; i < gen->n_array; ++i) {
418 if (cuda_array_is_read_only_scalar(&gen->array[i]))
419 continue;
420 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaFree(dev_%s));\n",
421 gen->array[i].name);
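/* Print cudaMemcpy calls copying each array that appears in the copy-in
 * set from the host to the device.  Read-only scalars are skipped
 * since they are passed to the kernel by value.
 */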
425 static void copy_arrays_to_device(struct cuda_gen *gen)
427 int i;
429 for (i = 0; i < gen->n_array; ++i) {
430 isl_space *dim;
431 isl_set *read_i;
432 int empty;
434 if (cuda_array_is_read_only_scalar(&gen->array[i]))
435 continue;
437 dim = isl_space_copy(gen->array[i].dim);
438 read_i = isl_union_set_extract_set(gen->copy_in, dim);
439 empty = isl_set_fast_is_empty(read_i);
440 isl_set_free(read_i);
441 if (empty)
442 continue;
444 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(dev_%s,",
445 gen->array[i].name);
447 if (cuda_array_is_scalar(&(gen->array[i])))
448 fprintf(gen->cuda.host_c, " &%s, ",
449 gen->array[i].name);
450 else
451 fprintf(gen->cuda.host_c, " %s, ", gen->array[i].name);
453 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
454 fprintf(gen->cuda.host_c, ", cudaMemcpyHostToDevice));\n");
456 fprintf(gen->cuda.host_c, "\n");
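/* Print cudaMemcpy calls copying each array that is written by the scop
 * back from the device to the host.
 */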
459 static void copy_arrays_from_device(struct cuda_gen *gen)
461 int i;
462 isl_union_set *write;
463 write = isl_union_map_range(isl_union_map_copy(gen->write));
465 for (i = 0; i < gen->n_array; ++i) {
466 isl_space *dim;
467 isl_set *write_i;
468 int empty;
470 dim = isl_space_copy(gen->array[i].dim);
471 write_i = isl_union_set_extract_set(write, dim);
472 empty = isl_set_fast_is_empty(write_i);
473 isl_set_free(write_i);
474 if (empty)
475 continue;
477 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(");
478 if (cuda_array_is_scalar(&gen->array[i]))
479 fprintf(gen->cuda.host_c, "&%s, ", gen->array[i].name);
480 else
481 fprintf(gen->cuda.host_c, "%s, ", gen->array[i].name);
482 fprintf(gen->cuda.host_c, "dev_%s, ", gen->array[i].name);
483 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
484 fprintf(gen->cuda.host_c, ", cudaMemcpyDeviceToHost));\n");
487 isl_union_set_free(write);
488 fprintf(gen->cuda.host_c, "\n");
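/* Read up to "len" integers from the file called "filename" into "sizes",
 * stopping early if fewer integers are available and leaving the
 * remaining entries untouched.
 */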
491 static void read_sizes_from_file(struct cuda_gen *gen, const char *filename,
492 int *sizes, int len)
494 int i;
495 FILE *file;
497 file = fopen(filename, "r");
498 if (!file)
499 return;
501 for (i = 0; i < len; ++i)
502 if (fscanf(file, "%d", &sizes[i]) < 1)
503 break;
505 fclose(file);
508 /* Internal data structure for extract_size_of_type.
509 * "type" specifies the name of the space that we want to extract.
510 * "res" is used to store the subset of that space.
512 struct ppcg_extract_size_data {
513 const char *type;
514 isl_set *res;
517 /* This function is called for each set in a union_set.
518 * If the name of the set matches data->type, we store the
519 * set in data->res.
521 static int extract_size_of_type(__isl_take isl_set *size, void *user)
523 struct ppcg_extract_size_data *data = user;
524 const char *name;
526 name = isl_set_get_tuple_name(size);
527 if (name && !strcmp(name, data->type)) {
528 data->res = size;
529 return -1;
532 isl_set_free(size);
533 return 0;
536 /* Given a union map { kernel[i] -> *[...] },
537 * return the range in the space called "type" for the kernel with
538 * sequence number "id".
540 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
541 const char *type, int id)
543 isl_space *space;
544 isl_set *dom;
545 isl_union_set *local_sizes;
546 struct ppcg_extract_size_data data = { type, NULL };
548 if (!sizes)
549 return NULL;
551 space = isl_union_map_get_space(sizes);
552 space = isl_space_set_from_params(space);
553 space = isl_space_add_dims(space, isl_dim_set, 1);
554 space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
555 dom = isl_set_universe(space);
556 dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
558 local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
559 isl_union_map_copy(sizes));
560 isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
561 isl_union_set_free(local_sizes);
562 return data.res;
565 /* Given a singleton set, extract the first (at most *len) elements
566 * of the single integer tuple into *sizes and update *len if needed.
568 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
570 int i;
571 int dim;
572 isl_int v;
574 if (!set)
575 return;
577 dim = isl_set_dim(set, isl_dim_set);
578 if (dim < *len)
579 *len = dim;
581 isl_int_init(v);
583 for (i = 0; i < *len; ++i) {
584 int ok;
586 ok = isl_set_plain_is_fixed(set, isl_dim_set, i, &v);
587 assert(ok);
589 sizes[i] = isl_int_get_si(v);
592 isl_int_clear(v);
594 isl_set_free(set);
597 /* Extract user specified "tile" sizes from the "sizes" command line option,
598 * defaulting to option->tile_size in each dimension.
600 static void read_tile_sizes(struct cuda_gen *gen)
602 int n;
603 isl_set *size;
605 gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
606 assert(gen->tile_size);
607 for (n = 0; n < gen->tile_len; ++n)
608 gen->tile_size[n] = gen->options->tile_size;
610 size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
611 read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
613 if (gen->n_parallel > gen->tile_len)
614 gen->n_parallel = gen->tile_len;
617 /* Extract user specified "block" sizes from the "sizes" command line option,
618 * after filling in some potentially useful defaults.
620 static void read_block_sizes(struct cuda_gen *gen)
622 int n;
623 isl_set *size;
625 n = gen->n_parallel;
626 gen->n_block = (n <= 3) ? n : 3;
627 switch (gen->n_block) {
628 case 1:
629 gen->block_dim[0] = 512;
630 break;
631 case 2:
632 gen->block_dim[0] = 32;
633 gen->block_dim[1] = 16;
634 break;
635 default:
636 gen->block_dim[0] = 32;
637 gen->block_dim[1] = 4;
638 gen->block_dim[2] = 4;
639 break;
642 size = extract_sizes(gen->sizes, "block", gen->kernel_id);
643 read_sizes_from_set(size, gen->block_dim, &gen->n_block);
646 /* Extract user specified "grid" sizes from the "sizes" command line option,
647 * after filling in some potentially useful defaults.
649 static void read_grid_sizes(struct cuda_gen *gen)
651 int n = gen->n_parallel;
652 isl_set *size;
654 gen->n_grid = (n <= 2) ? n : 2;
655 switch (gen->n_grid) {
656 case 1:
657 gen->grid_dim[0] = 32768;
658 break;
659 default:
660 gen->grid_dim[0] = 256;
661 gen->grid_dim[1] = 256;
662 break;
665 size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
666 read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
669 /* Extract user specified sizes from the "sizes" command line option
670 * after filling in some potentially useful defaults.
672 static void read_sizes(struct cuda_gen *gen)
674 read_tile_sizes(gen);
675 read_block_sizes(gen);
676 read_grid_sizes(gen);
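/* Free an array of "n" cuda_stmt structures, including the list of
 * accesses and the iteration domain of each statement.
 */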
679 static void free_stmts(struct cuda_stmt *stmts, int n)
681 int i;
683 for (i = 0; i < n; ++i) {
684 struct cuda_stmt_access *access, *next;
686 for (access = stmts[i].accesses; access; access = next) {
687 next = access->next;
688 isl_map_free(access->access);
689 free(access);
692 isl_set_free(stmts[i].domain);
694 free(stmts);
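/* Free the memory referenced by the cuda_gen data structure:
 * the statements, the array information and the isl objects
 * collected while scanning the scop.
 */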
697 void clear_cuda_gen(struct cuda_gen *gen)
699 free_stmts(gen->stmts, gen->n_stmts);
700 free_array_info(gen);
701 isl_union_map_free(gen->sizes);
702 isl_set_free(gen->context);
703 isl_union_set_free(gen->copy_in);
704 isl_union_map_free(gen->sched);
705 isl_union_map_free(gen->read);
706 isl_union_map_free(gen->write);
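/* Print the "len" elements of "list" in reverse order as a
 * parenthesized, comma-separated list.
 */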
709 static void print_reverse_list(FILE *out, int len, int *list)
711 int i;
713 if (len == 0)
714 return;
716 fprintf(out, "(");
717 for (i = 0; i < len; ++i) {
718 if (i)
719 fprintf(out, ", ");
720 fprintf(out, "%d", list[len - 1 - i]);
722 fprintf(out, ")");
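/* Print the kernel launch statement in the host code together with
 * the corresponding kernel declarations in the kernel source and
 * header files.  The arguments are the arrays accessed by the kernel
 * (read-only scalars by value, all other arrays as device pointers),
 * the parameters and the gen->tile_first outer host loop iterators "h%d".
 */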
725 static void print_kernel_launch(struct cuda_gen *gen,
726 __isl_keep isl_union_set *arrays)
728 int i;
729 int first = 1;
730 unsigned nparam;
731 isl_space *dim;
733 print_indent(gen->code.dst, gen->code.indent);
734 fprintf(gen->code.dst, "kernel%d <<<k%d_dimGrid, k%d_dimBlock>>> (",
735 gen->kernel_id, gen->kernel_id, gen->kernel_id);
736 fprintf(gen->cuda.kernel_c, "__global__ void kernel%d(",
737 gen->kernel_id);
738 fprintf(gen->cuda.kernel_h, "__global__ void kernel%d(",
739 gen->kernel_id);
741 for (i = 0; i < gen->n_array; ++i) {
742 isl_space *dim;
743 isl_set *arr;
744 int empty;
746 dim = isl_space_copy(gen->array[i].dim);
747 arr = isl_union_set_extract_set(arrays, dim);
748 empty = isl_set_fast_is_empty(arr);
749 isl_set_free(arr);
750 if (empty)
751 continue;
753 if (!first) {
754 fprintf(gen->code.dst, ", ");
755 fprintf(gen->cuda.kernel_c, ", ");
756 fprintf(gen->cuda.kernel_h, ", ");
759 if (cuda_array_is_read_only_scalar(&gen->array[i])) {
760 fprintf(gen->code.dst, "%s", gen->array[i].name);
761 fprintf(gen->cuda.kernel_c, "%s %s",
762 gen->array[i].type, gen->array[i].name);
763 fprintf(gen->cuda.kernel_h, "%s %s",
764 gen->array[i].type, gen->array[i].name);
765 } else {
766 fprintf(gen->code.dst, "dev_%s", gen->array[i].name);
767 fprintf(gen->cuda.kernel_c, "%s *%s",
768 gen->array[i].type, gen->array[i].name);
769 fprintf(gen->cuda.kernel_h, "%s *%s",
770 gen->array[i].type, gen->array[i].name);
773 first = 0;
776 dim = isl_union_set_get_space(arrays);
777 nparam = isl_space_dim(dim, isl_dim_param);
778 for (i = 0; i < nparam; ++i) {
779 const char *name = isl_space_get_dim_name(dim, isl_dim_param, i);
780 if (!first) {
781 fprintf(gen->code.dst, ", ");
782 fprintf(gen->cuda.kernel_c, ", ");
783 fprintf(gen->cuda.kernel_h, ", ");
785 fprintf(gen->code.dst, "%s", name);
786 fprintf(gen->cuda.kernel_c, "int %s", name);
787 fprintf(gen->cuda.kernel_h, "int %s", name);
788 first = 0;
790 isl_space_free(dim);
792 for (i = 0; i < gen->tile_first; ++i) {
793 if (!first) {
794 fprintf(gen->code.dst, ", ");
795 fprintf(gen->cuda.kernel_c, ", ");
796 fprintf(gen->cuda.kernel_h, ", ");
798 fprintf(gen->code.dst, "h%d", i);
799 fprintf(gen->cuda.kernel_c, "int h%d", i);
800 fprintf(gen->cuda.kernel_h, "int h%d", i);
801 first = 0;
804 fprintf(gen->code.dst, ");\n");
805 fprintf(gen->cuda.kernel_c, ")\n");
806 fprintf(gen->cuda.kernel_h, ");\n");
808 fprintf(gen->code.dst, "cudaCheckKernel();\n");
811 /* Construct a map from a domain of dimensionality "len"
812 * to a domain of dimensionality "len" + "tile_len" that tiles
813 * the "tile_len" coordinates starting at "first".
814 * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
815 * "dim" prescribes the parameters.
817 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
818 int first, int tile_len, int *tile_size)
820 int i;
821 isl_int v;
822 isl_basic_map *bmap;
823 isl_constraint *c;
824 isl_local_space *ls;
826 isl_int_init(v);
828 dim = isl_space_add_dims(dim, isl_dim_in, len);
829 dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
830 bmap = isl_basic_map_universe(isl_space_copy(dim));
831 ls = isl_local_space_from_space(dim);
833 for (i = 0; i < len - tile_len; ++i) {
834 int j = i < first ? i : i + tile_len;
835 int k = i < first ? i : i + 2 * tile_len;
837 c = isl_equality_alloc(isl_local_space_copy(ls));
838 isl_int_set_si(v, -1);
839 isl_constraint_set_coefficient(c, isl_dim_in, j, v);
840 isl_int_set_si(v, 1);
841 isl_constraint_set_coefficient(c, isl_dim_out, k, v);
842 bmap = isl_basic_map_add_constraint(bmap, c);
845 for (i = 0; i < tile_len; ++i) {
846 c = isl_equality_alloc(isl_local_space_copy(ls));
847 isl_int_set_si(v, -1);
848 isl_constraint_set_coefficient(c, isl_dim_in, first + i, v);
849 isl_int_set_si(v, tile_size[i]);
850 isl_constraint_set_coefficient(c, isl_dim_out, first + i, v);
851 isl_int_set_si(v, 1);
852 isl_constraint_set_coefficient(c, isl_dim_out,
853 first + i + tile_len, v);
854 bmap = isl_basic_map_add_constraint(bmap, c);
856 c = isl_inequality_alloc(isl_local_space_copy(ls));
857 isl_int_set_si(v, 1);
858 isl_constraint_set_coefficient(c, isl_dim_out,
859 first + i + tile_len, v);
860 bmap = isl_basic_map_add_constraint(bmap, c);
862 c = isl_inequality_alloc(isl_local_space_copy(ls));
863 isl_int_set_si(v, -1);
864 isl_constraint_set_coefficient(c, isl_dim_out,
865 first + i + tile_len, v);
866 isl_int_set_si(v, tile_size[i] - 1);
867 isl_constraint_set_constant(c, v);
868 bmap = isl_basic_map_add_constraint(bmap, c);
871 isl_local_space_free(ls);
872 isl_int_clear(v);
874 return isl_map_from_basic_map(bmap);
877 /* Construct a map from a domain of dimensionality "len"
878 * to a domain of dimensionality "len" + "wrap_len" that "wraps"
879 * the "wrap_len" coordinates starting at "first" according to "wrap_size".
880 * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
881 * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
882 * that are projected out at the end.
883 * "dim" prescribes the parameters.
885 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
886 int first, int wrap_len, int *wrap_size)
888 int i;
889 isl_basic_map *bmap;
890 isl_constraint *c;
891 isl_local_space *ls;
893 dim = isl_space_add_dims(dim, isl_dim_in, len);
894 dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
895 bmap = isl_basic_map_universe(isl_space_copy(dim));
896 ls = isl_local_space_from_space(dim);
898 for (i = 0; i < len; ++i) {
899 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
901 c = isl_equality_alloc(isl_local_space_copy(ls));
902 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
903 isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
904 bmap = isl_basic_map_add_constraint(bmap, c);
907 for (i = 0; i < wrap_len; ++i) {
908 c = isl_equality_alloc(isl_local_space_copy(ls));
909 isl_constraint_set_coefficient_si(c, isl_dim_out,
910 first + i, -1);
911 isl_constraint_set_coefficient_si(c, isl_dim_out,
912 first + wrap_len + i, 1);
913 isl_constraint_set_coefficient_si(c, isl_dim_out,
914 first + 2 * wrap_len + i, wrap_size[i]);
915 bmap = isl_basic_map_add_constraint(bmap, c);
917 c = isl_inequality_alloc(isl_local_space_copy(ls));
918 isl_constraint_set_coefficient_si(c, isl_dim_out,
919 first + wrap_len + i, 1);
920 bmap = isl_basic_map_add_constraint(bmap, c);
922 c = isl_inequality_alloc(isl_local_space_copy(ls));
923 isl_constraint_set_coefficient_si(c, isl_dim_out,
924 first + wrap_len + i, -1);
925 isl_constraint_set_constant_si(c, wrap_size[i] - 1);
926 bmap = isl_basic_map_add_constraint(bmap, c);
929 isl_local_space_free(ls);
931 bmap = isl_basic_map_project_out(bmap, isl_dim_out,
932 first + 2 * wrap_len, wrap_len);
934 return isl_map_from_basic_map(bmap);
937 /* Add "n" parameters named prefix%d.
939 static __isl_give isl_set *add_params( __isl_take isl_set *set,
940 int n, const char *prefix)
942 int i;
943 unsigned nparam;
944 char name[20];
946 nparam = isl_set_dim(set, isl_dim_param);
947 set = isl_set_add_dims(set, isl_dim_param, n);
949 for (i = 0; i < n; ++i) {
950 snprintf(name, sizeof(name), "%s%d", prefix, i);
951 set = isl_set_set_dim_name(set, isl_dim_param,
952 nparam + i, name);
955 return set;
958 /* Equate the "n" dimensions of "set" starting at "first" to
959 * freshly created parameters named prefix%d.
961 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
962 int first, int n, const char *prefix)
964 int i;
965 unsigned nparam;
966 isl_int v;
967 isl_space *dim;
968 isl_basic_set *bset;
969 isl_constraint *c;
970 isl_local_space *ls;
972 nparam = isl_set_dim(set, isl_dim_param);
974 set = add_params(set, n, prefix);
976 dim = isl_set_get_space(set);
977 bset = isl_basic_set_universe(isl_space_copy(dim));
978 ls = isl_local_space_from_space(dim);
980 isl_int_init(v);
982 for (i = 0; i < n; ++i) {
983 c = isl_equality_alloc(isl_local_space_copy(ls));
984 isl_int_set_si(v, -1);
985 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
986 isl_int_set_si(v, 1);
987 isl_constraint_set_coefficient(c, isl_dim_set, first + i, v);
988 bset = isl_basic_set_add_constraint(bset, c);
991 isl_int_clear(v);
992 isl_local_space_free(ls);
994 return isl_set_intersect(set, isl_set_from_basic_set(bset));
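/* Construct a universe set with "len" set dimensions in the space "dim"
 * and equate the "n" dimensions starting at "first" to freshly created
 * parameters named prefix%d.
 */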
997 static __isl_give isl_set *parametrization(__isl_take isl_space *dim,
998 int len, int first, int n, const char *prefix)
1000 isl_set *set;
1002 dim = isl_space_add_dims(dim, isl_dim_set, len);
1003 set = isl_set_universe(dim);
1005 return parametrize(set, first, n, prefix);
1008 /* Tile the B loops over the tile sizes and then tile/wrap
1009 * the T1 loops over the blocks.
1011 static __isl_give isl_union_map *tile_schedule(struct cuda_gen *gen,
1012 __isl_take isl_union_map *sched)
1014 isl_space *dim;
1015 isl_map *tiling, *block_tiling;
1017 dim = isl_union_map_get_space(sched);
1018 tiling = tile(isl_space_copy(dim), gen->untiled_len,
1019 gen->tile_first, gen->tile_len, gen->tile_size);
1021 if (gen->options->wrap)
1022 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
1023 gen->tile_first, gen->n_grid, gen->grid_dim);
1024 else
1025 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
1026 gen->tile_first, gen->n_grid, gen->grid_dim);
1028 gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
1030 tiling = isl_map_apply_range(tiling, block_tiling);
1032 sched = isl_union_map_apply_range(sched,
1033 isl_union_map_from_map(tiling));
1035 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1037 return sched;
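/* Equate the first gen->tile_first dimensions of the tiled schedule
 * to parameters "h%d" and the gen->n_grid dimensions starting at
 * gen->tile_first + gen->n_grid to parameters "b%d".
 */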
1040 static __isl_give isl_union_map *parametrize_tiled_schedule(
1041 struct cuda_gen *gen, __isl_take isl_union_map *sched)
1043 isl_space *dim;
1044 isl_set *par;
1046 dim = isl_union_map_get_space(sched);
1047 par = parametrization(dim, gen->tiled_len, 0, gen->tile_first, "h");
1048 sched = isl_union_map_intersect_range(sched,
1049 isl_union_set_from_set(par));
1051 dim = isl_union_map_get_space(sched);
1052 par = parametrization(dim, gen->tiled_len,
1053 gen->tile_first + gen->n_grid, gen->n_grid, "b");
1054 sched = isl_union_map_intersect_range(sched,
1055 isl_union_set_from_set(par));
1057 return sched;
1060 /* Tile/wrap the P1 loops over the threads.
1062 static __isl_give isl_union_map *thread_tile_schedule(struct cuda_gen *gen,
1063 __isl_take isl_union_map *sched)
1065 isl_space *dim;
1066 isl_map *tiling;
1067 isl_set *par;
1069 dim = isl_union_map_get_space(sched);
1071 if (gen->options->wrap)
1072 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
1073 gen->shared_len, gen->n_block, gen->block_dim);
1074 else
1075 tiling = tile(isl_space_copy(dim), gen->tiled_len,
1076 gen->shared_len, gen->n_block, gen->block_dim);
1077 gen->thread_tiled_len = gen->tiled_len + gen->n_block;
1079 sched = isl_union_map_apply_range(sched,
1080 isl_union_map_from_map(tiling));
1082 par = parametrization(dim, gen->thread_tiled_len,
1083 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1084 gen->n_block, "t");
1085 sched = isl_union_map_intersect_range(sched,
1086 isl_union_set_from_set(par));
1088 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1090 return sched;
1093 /* If the user asked for it, scale the shared memory tile loops
1094 * (T1T and T2) of "sched" by gen->tile_size[i].
1095 * If we are not performing "wrapping", then additionally scale the T1P
1096 * loops by gen->grid_dim[i].
1098 static __isl_give isl_union_map *scale_tile_loops(struct cuda_gen *gen,
1099 __isl_take isl_union_map *sched)
1101 int i;
1102 isl_space *dim;
1103 isl_basic_map *scale;
1104 isl_constraint *c;
1105 isl_local_space *ls;
1107 if (!gen->options->scale_tile_loops)
1108 return sched;
1110 dim = isl_union_map_get_space(sched);
1111 dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
1112 dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
1113 scale = isl_basic_map_universe(isl_space_copy(dim));
1114 ls = isl_local_space_from_space(dim);
1116 for (i = 0; i < gen->tiled_len; ++i) {
1117 int f = 1;
1119 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1120 f = gen->tile_size[i - gen->tile_first];
1121 if (!gen->options->wrap)
1122 f *= gen->grid_dim[i - gen->tile_first];
1123 } else if (i >= gen->tile_first + gen->n_grid &&
1124 i < gen->tile_first + gen->n_grid + gen->tile_len) {
1125 f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1128 c = isl_equality_alloc(isl_local_space_copy(ls));
1129 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1130 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1131 scale = isl_basic_map_add_constraint(scale, c);
1134 isl_local_space_free(ls);
1136 sched = isl_union_map_apply_range(sched,
1137 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1139 return sched;
1142 /* If we are not performing "wrapping" and if the user asked for it,
1143 * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1145 static __isl_give isl_union_map *scale_thread_tile_loops(struct cuda_gen *gen,
1146 __isl_take isl_union_map *sched)
1148 int i;
1149 isl_space *dim;
1150 isl_basic_map *scale;
1151 isl_constraint *c;
1152 isl_local_space *ls;
1154 if (gen->options->wrap)
1155 return sched;
1156 if (!gen->options->scale_tile_loops)
1157 return sched;
1159 dim = isl_union_map_get_space(sched);
1160 dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1161 dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1162 scale = isl_basic_map_universe(isl_space_copy(dim));
1163 ls = isl_local_space_from_space(dim);
1165 for (i = 0; i < gen->thread_tiled_len; ++i) {
1166 int f = 1;
1168 if (i >= gen->shared_len &&
1169 i < gen->shared_len + gen->n_block)
1170 f = gen->block_dim[i - gen->shared_len];
1172 c = isl_equality_alloc(isl_local_space_copy(ls));
1173 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1174 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1175 scale = isl_basic_map_add_constraint(scale, c);
1178 isl_local_space_free(ls);
1180 sched = isl_union_map_apply_range(sched,
1181 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1183 return sched;
1186 /* If we are not performing "wrapping" and if the user asked for it,
1187 * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1189 static __isl_give isl_union_map *scale_access_tile_loops(struct cuda_gen *gen,
1190 __isl_take isl_union_map *sched, int len, int first, int n_tile)
1192 int i;
1193 isl_space *dim;
1194 isl_basic_map *scale;
1195 isl_constraint *c;
1196 isl_local_space *ls;
1198 if (gen->options->wrap)
1199 return sched;
1200 if (!gen->options->scale_tile_loops)
1201 return sched;
1203 dim = isl_union_map_get_space(sched);
1204 dim = isl_space_add_dims(dim, isl_dim_in, len);
1205 dim = isl_space_add_dims(dim, isl_dim_out, len);
1206 scale = isl_basic_map_universe(isl_space_copy(dim));
1207 ls = isl_local_space_from_space(dim);
1209 for (i = 0; i < len; ++i) {
1210 int f = 1;
1212 if (i >= first && i < first + n_tile)
1213 f = gen->block_dim[i - first];
1215 c = isl_equality_alloc(isl_local_space_copy(ls));
1216 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1217 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1218 scale = isl_basic_map_add_constraint(scale, c);
1221 isl_local_space_free(ls);
1223 sched = isl_union_map_apply_range(sched,
1224 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1226 return sched;
1229 /* If print_user_stmt is set, we want to print the statements ourselves,
1230 * instead of relying on the C preprocessor. If so, we need to use
1231 * the stop option so that the domains will be saved on the statement
1232 * nodes.
1234 static void print_cloog_shared_body(struct cuda_gen *gen,
1235 __isl_keep isl_set *context, __isl_keep isl_union_map *sched, int len,
1236 void (*print_user_stmt)(struct clast_printer_info *info,
1237 struct clast_user_stmt *s),
1238 int first_unroll)
1240 int i;
1241 CloogOptions *options;
1242 CloogDomain *cloog_context;
1243 CloogUnionDomain *ud;
1244 CloogInput *input;
1245 struct clast_stmt *stmt;
1246 char name[20];
1248 sched = isl_union_map_copy(sched);
1249 sched = isl_union_map_align_params(sched, isl_set_get_space(context));
1251 options = cloog_options_malloc(gen->state);
1252 options->language = CLOOG_LANGUAGE_C;
1253 options->strides = 1;
1254 options->sh = 1;
1255 options->f = len;
1256 options->l = -1;
1257 options->override = 1;
1258 options->save_domains = 1;
1259 options->noscalars = 1;
1260 options->first_unroll = first_unroll;
1262 ud = cloog_union_domain_from_isl_union_map(sched);
1263 for (i = 0; i < len; ++i) {
1264 snprintf(name, sizeof(name), "c%d", i);
1265 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
1267 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
1268 input = cloog_input_alloc(cloog_context, ud);
1270 stmt = cloog_clast_create_from_input(input, options);
1272 gen->stmt_code.indent = gen->kernel_code.indent;
1273 gen->stmt_code.dst = gen->cuda.kernel_c;
1274 gen->stmt_code.print_user_stmt = print_user_stmt;
1275 gen->stmt_code.print_user_stmt_list = NULL;
1276 gen->stmt_code.print_for_head = NULL;
1277 gen->stmt_code.print_for_foot = NULL;
1278 gen->stmt_code.user = gen;
1279 print_clast(&gen->stmt_code, stmt);
1281 cloog_clast_free(stmt);
1282 cloog_options_free(options);
1285 /* Add "len" parameters p[i] called prefix%d,
1286 * with bounds 0 <= p[i] < size[i].
1288 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1289 int len, int *size, const char *prefix)
1291 int i;
1292 unsigned nparam;
1293 isl_int v;
1294 isl_space *dim;
1295 isl_basic_set *bset;
1296 isl_constraint *c;
1297 isl_local_space *ls;
1298 char name[20];
1300 nparam = isl_set_dim(set, isl_dim_param);
1301 set = isl_set_add_dims(set, isl_dim_param, len);
1303 for (i = 0; i < len; ++i) {
1304 snprintf(name, sizeof(name), "%s%d", prefix, i);
1305 set = isl_set_set_dim_name(set, isl_dim_param,
1306 nparam + i, name);
1309 dim = isl_set_get_space(set);
1310 bset = isl_basic_set_universe(isl_space_copy(dim));
1311 ls = isl_local_space_from_space(dim);
1313 isl_int_init(v);
1315 for (i = 0; i < len; ++i) {
1316 c = isl_inequality_alloc(isl_local_space_copy(ls));
1317 isl_int_set_si(v, 1);
1318 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1319 bset = isl_basic_set_add_constraint(bset, c);
1321 c = isl_inequality_alloc(isl_local_space_copy(ls));
1322 isl_int_set_si(v, -1);
1323 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1324 isl_int_set_si(v, size[i] - 1);
1325 isl_constraint_set_constant(c, v);
1326 bset = isl_basic_set_add_constraint(bset, c);
1329 isl_int_clear(v);
1330 isl_local_space_free(ls);
1332 return isl_set_intersect(set, isl_set_from_basic_set(bset));
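/* Print code for the given schedule within a single shared memory tile.
 * The first gen->shared_len dimensions of "shared_domain" are treated
 * as parameters "g%d" and the thread identifiers are added as
 * parameters "t%d" bounded by the block sizes.
 */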
1335 static void print_shared_body(struct cuda_gen *gen,
1336 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched,
1337 int len, void (*print_user_stmt)(struct clast_printer_info *info,
1338 struct clast_user_stmt *s),
1339 int first_unroll)
1341 isl_set *context;
1343 context = isl_set_copy(shared_domain);
1344 context = parametrize(context, 0, gen->shared_len, "g");
1345 context = isl_set_project_out(context, isl_dim_set, 0, gen->shared_len);
1346 context = add_bounded_parameters(context,
1347 gen->n_block, gen->block_dim, "t");
1349 print_cloog_shared_body(gen, context, sched, len, print_user_stmt,
1350 first_unroll);
1352 isl_set_free(context);
1355 /* Given a tile of an array, construct a map that maps each element
1356 * of the tile to a copy of the tile shifted to the origin
1357 * (based on the lower bounds in group->private_bound or group->shared_bound).
1358 * If any of the indices is strided, then {private,shared}_bound[i].shift_map
1359 * is applied to the index first.
1360 * The domain of the resulting map is "access",
1361 * while the range space is anonymous.
1363 static __isl_give isl_map *shift_access(__isl_take isl_set *access,
1364 struct cuda_array_ref_group *group)
1366 int i;
1367 isl_space *dim;
1368 isl_basic_set *bset;
1369 isl_basic_map *bmap;
1370 isl_aff *lb;
1371 isl_basic_set *offset;
1372 isl_basic_map *shift;
1373 isl_basic_map *pre_shift;
1374 isl_map *sched;
1375 const char *name;
1376 struct cuda_array_bound *bounds;
1377 int n_index = group->array->n_index;
1379 bounds = group->private_bound;
1380 if (!bounds)
1381 bounds = group->shared_bound;
1383 dim = isl_set_get_space(access);
1384 dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
1385 offset = isl_basic_set_universe(dim);
1386 for (i = 0; i < n_index; ++i) {
1387 lb = isl_aff_copy(bounds[i].lb);
1388 bmap = isl_basic_map_from_aff(lb);
1389 bset = isl_basic_map_range(bmap);
1390 offset = isl_basic_set_flat_product(offset, bset);
1392 offset = isl_basic_set_neg(offset);
1394 dim = isl_space_map_from_set(isl_set_get_space(access));
1395 shift = isl_basic_map_identity(dim);
1396 shift = isl_basic_map_set_tuple_name(shift, isl_dim_out, NULL);
1398 bset = isl_basic_set_universe(isl_set_get_space(access));
1399 bmap = isl_basic_map_from_domain_and_range(bset, offset);
1401 shift = isl_basic_map_sum(shift, bmap);
1403 dim = isl_set_get_space(access);
1404 dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
1405 dim = isl_space_map_from_set(dim);
1406 pre_shift = isl_basic_map_universe(isl_space_copy(dim));
1407 dim = isl_space_add_dims(dim, isl_dim_in, 1);
1408 dim = isl_space_add_dims(dim, isl_dim_out, 1);
1409 for (i = 0; i < n_index; ++i) {
1410 if (!bounds[i].shift_map)
1411 bmap = isl_basic_map_identity(isl_space_copy(dim));
1412 else
1413 bmap = isl_basic_map_copy(bounds[i].shift_map);
1414 pre_shift = isl_basic_map_flat_product(pre_shift, bmap);
1416 isl_space_free(dim);
1417 name = isl_basic_map_get_tuple_name(shift, isl_dim_in);
1418 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_in, name);
1419 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_out, name);
1420 shift = isl_basic_map_apply_range(pre_shift, shift);
1422 sched = isl_map_from_basic_map(shift);
1423 sched = isl_map_intersect_domain(sched, access);
1425 return sched;
1428 /* Construct a schedule for iterating over all elements in the given
1429 * piece of an array. The schedule iterates over a copy of the piece
1430 * that is shifted to the origin.
1431 * We subsequently also perform the tiling/wrapping over the threads.
1433 * In particular, we tile the final iterators so that the final thread
1434 * dimension runs over the final array dimension.
1435 * However, if those final iterators have only a single iteration,
1436 * we try to tile earlier iterators instead.
1438 static __isl_give isl_union_map *access_schedule(struct cuda_gen *gen,
1439 __isl_take isl_set *access, struct cuda_array_ref_group *group)
1441 isl_space *dim;
1442 isl_map *sched;
1443 isl_union_map *usched;
1444 isl_map *tiling;
1445 isl_set *par;
1446 unsigned nvar = isl_set_dim(access, isl_dim_set);
1447 int n_tile;
1448 int first;
1450 sched = shift_access(access, group);
1452 n_tile = gen->n_block;
1453 if (n_tile > nvar) {
1454 int i;
1455 sched = isl_map_insert_dims(sched,
1456 isl_dim_out, 0, n_tile - nvar);
1457 for (i = 0; i < n_tile - nvar; ++i)
1458 sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1459 nvar = n_tile;
1462 first = nvar - n_tile;
1464 for (; first > 0; first --)
1465 if (!isl_map_plain_is_fixed(sched, isl_dim_out,
1466 first + n_tile - 1, NULL))
1467 break;
1469 dim = isl_map_get_space(sched);
1470 dim = isl_space_params(dim);
1471 if (gen->options->wrap)
1472 tiling = wrap(isl_space_copy(dim), nvar, first,
1473 n_tile, gen->block_dim);
1474 else
1475 tiling = tile(isl_space_copy(dim), nvar, first,
1476 n_tile, gen->block_dim);
1477 sched = isl_map_apply_range(sched, tiling);
1479 par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1480 usched = isl_union_map_from_map(sched);
1481 usched = isl_union_map_intersect_range(usched,
1482 isl_union_set_from_set(par));
1484 usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1485 first, n_tile);
1487 return usched;
1490 /* Print an access to the element in the global memory copy of the
1491 * given array that corresponds to the element described by "pma"
1492 * of the original array.
1493 * The copy in global memory has been linearized, so we need to take
1494 * the array size into account.
1496 static void print_global_index(FILE *out,
1497 struct cuda_array_info *array, __isl_keep isl_pw_multi_aff *pma,
1498 __isl_keep isl_set *domain)
1500 int i;
1501 isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
1502 isl_printer *prn;
1504 if (cuda_array_is_scalar(array)) {
1505 fprintf(out, "*%s", array->name);
1506 return;
1509 fprintf(out, "%s[", array->name);
1510 prn = isl_printer_to_file(ctx, out);
1511 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1512 for (i = 0; i + 1 < array->n_index; ++i)
1513 prn = isl_printer_print_str(prn, "(");
1514 for (i = 0; i < array->n_index; ++i) {
1515 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);
1516 pa = isl_pw_aff_coalesce(pa);
1517 pa = isl_pw_aff_gist(pa, isl_set_copy(domain));
1518 if (i) {
1519 prn = isl_printer_print_str(prn, ") * (");
1520 prn = isl_printer_print_pw_aff(prn,
1521 array->local_bound[i]);
1522 prn = isl_printer_print_str(prn, ") + ");
1524 prn = isl_printer_print_pw_aff(prn, pa);
1525 isl_pw_aff_free(pa);
1527 isl_printer_free(prn);
1528 fprintf(out, "]");
1531 /* Given an index expression into a tile of an array, adjust the expression
1532 * to a shift of the tile to the origin
1533 * (based on the lower bounds in array->shared_bound).
1534 * If the index is strided, then we first add
1535 * bound->shift and divide by bound->stride.
1537 static __isl_give isl_pw_aff *shift_index(__isl_take isl_pw_aff *pa,
1538 struct cuda_array_info *array,
1539 struct cuda_array_bound *bound, __isl_take isl_set *domain)
1541 isl_aff *lb;
1542 isl_pw_aff *tmp;
1544 if (bound->shift) {
1545 isl_aff *shift;
1546 shift = bound->shift;
1547 shift = isl_aff_copy(shift);
1548 shift = isl_aff_project_domain_on_params(shift);
1549 shift = isl_aff_align_params(shift, isl_pw_aff_get_space(pa));
1550 tmp = isl_pw_aff_alloc(isl_set_copy(domain), shift);
1551 pa = isl_pw_aff_add(pa, tmp);
1552 pa = isl_pw_aff_scale_down(pa, bound->stride);
1555 lb = isl_aff_copy(bound->lb);
1556 lb = isl_aff_project_domain_on_params(lb);
1558 lb = isl_aff_align_params(lb, isl_pw_aff_get_space(pa));
1560 tmp = isl_pw_aff_alloc(isl_set_copy(domain), lb);
1561 pa = isl_pw_aff_sub(pa, tmp);
1562 pa = isl_pw_aff_coalesce(pa);
1563 pa = isl_pw_aff_gist(pa, domain);
1565 return pa;
1568 /* Print an access to the element in the private/shared memory copy of the
1569 * given array reference group that corresponds to the element described
1570 * by "pma" of the original array.
1571 * Since the array in private/shared memory is just a shifted copy of part
1572 * of the original array, we simply need to subtract the lower bound,
1573 * which was computed in can_tile_for_shared_memory.
1574 * If any of the indices is strided, then we first add
1575 * bounds[i].shift and divide by bounds[i].stride.
1577 static void print_local_index(FILE *out,
1578 struct cuda_array_ref_group *group, struct cuda_array_bound *bounds,
1579 __isl_keep isl_pw_multi_aff *pma, __isl_keep isl_set *domain)
1581 int i;
1582 isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
1583 isl_printer *prn;
1584 struct cuda_array_info *array = group->array;
1586 print_array_name(out, group);
1587 for (i = 0; i < array->n_index; ++i) {
1588 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);
1590 pa = shift_index(pa, array, &bounds[i], isl_set_copy(domain));
1592 fprintf(out, "[");
1593 prn = isl_printer_to_file(ctx, out);
1594 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1595 prn = isl_printer_print_pw_aff(prn, pa);
1596 isl_printer_free(prn);
1597 fprintf(out, "]");
1598 isl_pw_aff_free(pa);
1602 /* This function is called for each leaf in the clast of the code
1603 * for copying to or from shared/private memory.
1604 * The statement name is {read,write}_{shared,private}_<array>.
1606 * The schedule iterates over the array elements, so we can use
1607 * the domain of copy_sched at the current scheduling position
1608 * as the index of the array.
1610 static void print_copy_statement(struct clast_printer_info *code,
1611 struct clast_user_stmt *u)
1613 struct cuda_gen *gen = code->user;
1614 isl_set *domain;
1615 isl_map *sched;
1616 struct cuda_array_ref_group *group = gen->copy_group;
1617 struct cuda_array_bound *bounds = gen->copy_bound;
1618 int i;
1619 unsigned n_in;
1620 unsigned n_out;
1621 isl_space *dim;
1622 isl_set *param;
1623 isl_set *index;
1624 isl_pw_multi_aff *pma;
1625 int read;
1627 read = !strncmp(u->statement->name, "read", 4);
1629 domain = extract_host_domain(u);
1630 assert(domain);
1632 sched = isl_map_copy(gen->copy_sched);
1633 sched = isl_map_reverse(sched);
1634 sched = isl_map_intersect_domain(sched, domain);
1635 n_in = isl_map_dim(sched, isl_dim_in);
1636 n_out = isl_map_dim(sched, isl_dim_out);
1637 dim = isl_map_get_space(sched);
1638 dim = isl_space_drop_dims(dim, isl_dim_in, 0, n_in);
1639 dim = isl_space_drop_dims(dim, isl_dim_out, 0, n_out);
1640 param = parametrization(dim, n_in, 0, n_in, "c");
1641 sched = isl_map_align_params(sched, isl_set_get_space(param));
1642 sched = isl_map_intersect_domain(sched, param);
1643 index = isl_map_range(sched);
1644 domain = isl_set_copy(index);
1645 pma = isl_pw_multi_aff_from_set(index);
1646 pma = isl_pw_multi_aff_coalesce(pma);
1647 domain = isl_set_params(domain);
1649 print_indent(code->dst, code->indent);
1650 if (read) {
1651 print_local_index(code->dst, group, bounds, pma, domain);
1652 fprintf(code->dst, " = ");
1653 print_global_index(code->dst, group->array, pma, domain);
1654 } else {
1655 print_global_index(code->dst, group->array, pma, domain);
1656 fprintf(code->dst, " = ");
1657 print_local_index(code->dst, group, bounds, pma, domain);
1659 fprintf(code->dst, ";\n");
1661 isl_pw_multi_aff_free(pma);
1662 isl_set_free(domain);
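/* Print code that copies the elements of "access" between global memory
 * and the shared memory copy of the given array reference group.
 * "type" is either "read" or "write" and is used in the name of
 * the generated copy statements.
 */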
1665 static void print_shared_access(struct cuda_gen *gen,
1666 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
1667 const char *type, struct cuda_array_ref_group *group)
1669 const char *array_name;
1670 char *name;
1671 isl_ctx *ctx;
1672 isl_union_map *sched;
1673 unsigned nvar = isl_set_dim(access, isl_dim_set);
1674 int n_tile;
1676 ctx = isl_set_get_ctx(access);
1677 array_name = isl_set_get_tuple_name(access);
1678 name = isl_alloc_array(ctx, char,
1679 strlen(type) + sizeof("_shared_") + strlen(array_name) + 20);
1680 if (group->array->n_group > 1)
1681 sprintf(name, "%s_shared_%s_%d", type, array_name, group->nr);
1682 else
1683 sprintf(name, "%s_shared_%s", type, array_name);
1684 access = isl_set_set_tuple_name(access, name);
1685 free(name);
1687 sched = access_schedule(gen, access, group);
1689 n_tile = gen->n_block;
1690 if (n_tile > nvar)
1691 n_tile = nvar;
1693 gen->copy_sched = isl_map_from_union_map(isl_union_map_copy(sched));
1694 gen->copy_group = group;
1695 gen->copy_bound = group->shared_bound;
1697 print_shared_body(gen, shared_domain, sched, nvar + n_tile,
1698 &print_copy_statement, -1);
1700 isl_union_map_free(sched);
1701 isl_map_free(gen->copy_sched);
1704 /* Return the union of all read (read = 1) and/or write (write = 1)
1705 * access relations in the group.
1707 static __isl_give isl_union_map *group_access_relation(
1708 struct cuda_array_ref_group *group, int read, int write)
1710 int i;
1711 isl_union_map *access;
1713 access = isl_union_map_empty(isl_map_get_space(group->access));
1714 for (i = 0; i < group->n_ref; ++i) {
1715 isl_map *map_i;
1717 if (!((read && group->refs[i]->read) ||
1718 (write && group->refs[i]->write)))
1719 continue;
1720 map_i = isl_map_copy(group->refs[i]->access);
1721 access = isl_union_map_union(access,
1722 isl_union_map_from_map(map_i));
1725 return access;
1728 /* Check that none of the shared memory tiles involve any strides.
1730 static int no_strides(struct cuda_array_ref_group *group)
1732 int i;
1733 int n_index = group->array->n_index;
1735 for (i = 0; i < n_index; ++i)
1736 if (group->shared_bound[i].shift)
1737 return 0;
1739 return 1;
1742 /* Return a set containing the values of the given index i
1743 * of the elements in the array tile in global memory that corresponds
1744 * to the shared memory copy.
1745 * In particular, if a is the index, we return a set with constraints
1747 * tile_offset <= a <= tile_offset + tile_size - 1
1749 * and
1751 * 0 <= a <= array_size - 1
1754 static __isl_give isl_set *group_tile_dim(struct cuda_array_ref_group *group,
1755 int i)
1757 isl_basic_set *tile;
1758 isl_aff *aff;
1759 isl_constraint *c;
1760 isl_local_space *ls;
1761 isl_pw_aff *bound;
1762 isl_set *dom;
1763 isl_set *tile_set;
1765 aff = isl_aff_copy(group->shared_bound[i].lb);
1766 aff = isl_aff_add_dims(aff, isl_dim_in, 1);
1767 ls = isl_aff_get_domain_local_space(aff);
1768 aff = isl_aff_neg(aff);
1769 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1770 c = isl_inequality_from_aff(isl_aff_copy(aff));
1771 tile = isl_basic_set_from_constraint(c);
1773 aff = isl_aff_neg(aff);
1774 aff = isl_aff_add_constant(aff, group->shared_bound[i].size);
1775 aff = isl_aff_add_constant_si(aff, -1);
1776 c = isl_inequality_from_aff(aff);
1777 tile = isl_basic_set_add_constraint(tile, c);
1779 aff = isl_aff_zero_on_domain(ls);
1780 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1781 c = isl_inequality_from_aff(aff);
1782 tile = isl_basic_set_add_constraint(tile, c);
1784 bound = isl_pw_aff_copy(group->array->bound[i]);
1785 bound = isl_pw_aff_add_dims(bound, isl_dim_in, 1);
1786 ls = isl_local_space_from_space(isl_pw_aff_get_domain_space(bound));
1787 aff = isl_aff_zero_on_domain(ls);
1788 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1789 aff = isl_aff_add_constant_si(aff, 1);
1790 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
1792 tile_set = isl_pw_aff_ge_set(bound, isl_pw_aff_alloc(dom, aff));
1793 tile_set = isl_set_align_params(tile_set, isl_basic_set_get_space(tile));
1794 tile_set = isl_set_intersect(tile_set, isl_set_from_basic_set(tile));
1796 return tile_set;
1799 /* Return a set containing the elements in the array tile in
1800 * global memory that corresponds to the shared memory copy.
1802 static __isl_give isl_set *group_tile(struct cuda_array_ref_group *group)
1804 int i;
1805 int n_index = group->array->n_index;
1806 isl_set *tile;
1808 tile = group_tile_dim(group, 0);
1809 for (i = 1; i < n_index; ++i) {
1810 isl_set *tile_i;
1812 tile_i = group_tile_dim(group, i);
1813 tile = isl_set_flat_product(tile, tile_i);
1816 tile = isl_set_set_tuple_name(tile, group->array->name);
1818 return tile;
1821 /* Print code for reading into or writing from shared memory
1822 * the given array reference group.
1824 * sched maps the original iteration domains to the shared memory tile loops.
1826 * If we are performing a read from global memory to shared memory,
1827 * if the array involved is not a scalar and if the definition of the
1828 * shared memory tiles does not involve any strides, then we copy
1829 * the entire tile to shared memory. This may result in some extra
1830 * elements getting copied, but it should lead to simpler code
1831 * (which means that fewer registers may be needed) and less divergence.
1833 * Otherwise, we only copy the elements that will be read or have been written
1834 * in the kernel.
1836 * Note that the absence of stride requirement can easily be lifted.
1837 * We would just need to add constraints of the form
1839 * shift + a = stride * alpha
1841 static int print_group_shared_accesses(struct cuda_gen *gen,
1842 struct cuda_array_ref_group *group, const char *type,
1843 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched)
1845 int read;
1846 isl_union_map *access;
1847 isl_union_set *uset;
1848 isl_set *access_set;
1850 if (group->private_bound)
1851 return 0;
1852 if (!group->shared_bound)
1853 return 0;
1855 read = !strcmp(type, "read");
1857 access = group_access_relation(group, read, !read);
1858 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
1859 uset = isl_union_map_range(access);
1861 if (isl_union_set_is_empty(uset)) {
1862 isl_union_set_free(uset);
1863 return 0;
1866 if (read && group->array->n_index > 0 && no_strides(group)) {
1867 isl_union_set_free(uset);
1868 access_set = group_tile(group);
1869 print_shared_access(gen, shared_domain, access_set,
1870 type, group);
1871 return 1;
1874 access_set = isl_set_from_union_set(uset);
1875 access_set = isl_set_coalesce(access_set);
1877 print_shared_access(gen, shared_domain, access_set, type, group);
1879 return 1;
1882 /* Print code for reading into or writing from shared memory at
1883 * the given level (-1 for innermost).
1885 * If we are not printing at the innermost level, then the dimensionality
1886 * of shared_domain may be smaller than gen->shared_len.
1887 * As the rest of the code assumes that the domain of access has
1888 * gen->shared_len dimensions, we therefore may need to embed this domain
1889 * in a higher dimensional space after intersection with shared_domain.
1891 static void print_shared_accesses(struct cuda_gen *gen,
1892 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
1893 const char *type, int level)
1895 int i, j;
1896 isl_space *dim;
1897 isl_map *proj;
1898 isl_set *par;
1899 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
1900 int sync = 0;
1901 isl_union_map *sched;
1903 shared_domain = isl_set_copy(shared_domain);
1904 sched = isl_union_map_copy(gen->tiled_sched);
1905 dim = isl_union_map_get_space(sched);
1906 proj = projection(dim, gen->tiled_len, shared_len);
1907 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
1908 sched = isl_union_map_intersect_range(sched,
1909 isl_union_set_from_set(isl_set_copy(shared_domain)));
1910 if (shared_len != gen->shared_len) {
1911 dim = isl_union_map_get_space(sched);
1912 proj = projection(dim, gen->shared_len, shared_len);
1913 proj = isl_map_reverse(proj);
1914 shared_domain = isl_set_apply(shared_domain,
1915 isl_map_copy(proj));
1916 sched = isl_union_map_apply_range(sched,
1917 isl_union_map_from_map(proj));
1920 dim = isl_union_map_get_space(sched);
1921 par = parametrization(dim, gen->shared_len, 0, gen->shared_len, "g");
1922 sched = isl_union_map_intersect_range(sched,
1923 isl_union_set_from_set(par));
1925 for (i = 0; i < gen->n_array; ++i) {
1926 struct cuda_array_info *array = &gen->array[i];
1928 for (j = 0; j < array->n_group; ++j) {
1929 if (array->groups[j]->print_shared_level != level)
1930 continue;
1932 if (print_group_shared_accesses(gen, array->groups[j],
1933 type, shared_domain, sched))
1934 sync = 1;
1938 isl_union_map_free(sched);
1939 isl_set_free(shared_domain);
1941 if (sync) {
1942 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
1943 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
1947 /* This function is called for each access to an array in some statement
1948 * in the original code.
1949 * Replace that access by an access to shared or (linearized) global memory.
1950 * Since the array in shared memory is just
1951 * a shifted copy of part of the original array, we simply need
1952 * to subtract the lower bound, which was computed
1953 * in can_tile_for_shared_memory.
1954 * If any of the indices is strided, then we first add
1955 * shared_bound[i].shift and divide by shared_bound[i].stride.
1957 * If the given array is accessed directly from global memory,
1958 * we don't need to perform any shifting and simply simplify
1959 * the expression in the context of the domain instead.
1961 * If the array space (range of access) has no name, then we are
1962 * accessing an iterator in the original program.
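* For example, an access A[i][j] that is mapped to shared memory is printed
* as something like shared_A[i - lb0][j - lb1], with lb0 and lb1 the tile
* offsets computed in can_tile_for_shared_memory, while the same access kept
* in global memory is printed in linearized form as something like
* A[(i) * (N1) + j], where N1 is the (localized) size of the second
* array dimension.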
1964 static void print_access(struct cuda_gen *gen, __isl_take isl_map *access,
1965 int group_nr)
1967 int i;
1968 const char *name;
1969 unsigned n_index;
1970 struct cuda_array_info *array = NULL;
1971 isl_printer *prn;
1972 isl_pw_multi_aff *pma;
1973 isl_set *data_set;
1974 isl_set *domain;
1975 struct cuda_array_bound *bounds = NULL;
1977 access = isl_map_align_params(access,
1978 isl_set_get_space(gen->stmt_domain));
1980 data_set = isl_set_apply(isl_set_copy(gen->stmt_domain), access);
1982 name = isl_set_get_tuple_name(data_set);
1984 if (!name)
1985 fprintf(gen->cuda.kernel_c, "(");
1986 else {
1987 struct cuda_array_ref_group *group;
1989 for (i = 0; i < gen->n_array; ++i) {
1990 if (strcmp(name, gen->array[i].name))
1991 continue;
1992 array = &gen->array[i];
1994 assert(array);
1995 group = array->groups[group_nr];
1996 bounds = group->private_bound;
1997 if (!bounds)
1998 bounds = group->shared_bound;
2000 if (!bounds && cuda_array_is_scalar(array) && !array->read_only)
2001 fprintf(gen->cuda.kernel_c, "*");
2002 print_array_name(gen->cuda.kernel_c, group);
2004 if (cuda_array_is_scalar(array)) {
2005 isl_set_free(data_set);
2006 return;
2009 fprintf(gen->cuda.kernel_c, "[");
2013 n_index = isl_set_dim(data_set, isl_dim_set);
2014 pma = isl_pw_multi_aff_from_set(data_set);
2015 pma = isl_pw_multi_aff_coalesce(pma);
2017 prn = isl_printer_to_file(gen->ctx, gen->cuda.kernel_c);
2018 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
2020 if (!bounds)
2021 for (i = 0; i + 1 < n_index; ++i)
2022 prn = isl_printer_print_str(prn, "(");
2024 for (i = 0; i < n_index; ++i) {
2025 isl_pw_aff *index;
2027 index = isl_pw_multi_aff_get_pw_aff(pma, i);
2029 if (!array) {
2030 prn = isl_printer_print_pw_aff(prn, index);
2031 isl_pw_aff_free(index);
2032 continue;
2035 domain = isl_set_copy(gen->stmt_domain);
2036 domain = isl_set_params(domain);
2037 if (!bounds) {
2038 index = isl_pw_aff_coalesce(index);
2039 index = isl_pw_aff_gist(index, domain);
2040 } else
2041 index = shift_index(index, array, &bounds[i], domain);
2043 if (i) {
2044 if (!bounds) {
2045 prn = isl_printer_print_str(prn, ") * (");
2046 prn = isl_printer_print_pw_aff(prn,
2047 array->local_bound[i]);
2048 prn = isl_printer_print_str(prn, ") + ");
2049 } else
2050 prn = isl_printer_print_str(prn, "][");
2052 prn = isl_printer_print_pw_aff(prn, index);
2053 isl_pw_aff_free(index);
2055 if (!name)
2056 prn = isl_printer_print_str(prn, ")");
2057 else
2058 prn = isl_printer_print_str(prn, "]");
2059 isl_printer_free(prn);
2061 isl_pw_multi_aff_free(pma);
2064 struct cuda_access_print_info {
2065 struct cuda_gen *gen;
2066 struct cuda_stmt_access *access;
2069 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
2070 * with the pet printer. This means that whenever the pet printer prints a
2071 * pet access expression we have the corresponding cuda access available and can
2072 * print the modified access.
2074 static void print_cuda_access(struct pet_expr *expr, void *usr)
2076 struct cuda_access_print_info *info =
2077 (struct cuda_access_print_info *) usr;
2078 print_access(info->gen, isl_map_copy(info->access->access),
2079 info->access->group);
2080 info->access = info->access->next;
2083 static void print_stmt_body(struct cuda_gen *gen,
2084 FILE *out, struct cuda_stmt *stmt)
2086 struct cuda_access_print_info info;
2088 info.gen = gen;
2089 info.access = stmt->accesses;
2091 print_pet_expr(out, stmt->body, print_cuda_access, &info);
2092 fprintf(out, ";\n");
2095 /* This function is called for each leaf in the innermost clast,
2096 * i.e., for each statement.
2097 * We print the statement body, simplifying the accesses based
2098 * on the schedule.
2100 static void print_statement(struct clast_printer_info *code,
2101 struct clast_user_stmt *u)
2103 struct cuda_gen *gen = code->user;
2104 isl_space *dim;
2105 isl_set *par;
2106 isl_set *stmt_domain;
2107 isl_union_map *stmt_sched;
2108 isl_union_set *uset;
2109 int nr;
2110 struct cuda_stmt *stmt;
2112 nr = atoi(u->statement->name + 2);
2113 stmt = &gen->stmts[nr];
2115 stmt_domain = extract_host_domain(u);
2117 stmt_sched = isl_union_map_intersect_range(
2118 isl_union_map_copy(gen->local_sched),
2119 isl_union_set_from_set(extend(stmt_domain,
2120 gen->thread_tiled_len)));
2121 dim = isl_union_map_get_space(stmt_sched);
2122 par = parametrization(dim, gen->thread_tiled_len, 0,
2123 gen->thread_tiled_len, "c");
2124 stmt_sched = isl_union_map_intersect_range(stmt_sched,
2125 isl_union_set_from_set(par));
2127 uset = isl_union_map_domain(stmt_sched);
2128 dim = isl_union_set_get_space(uset);
2129 dim = isl_space_add_dims(dim, isl_dim_set,
2130 isl_set_dim(stmt->domain, isl_dim_set));
2131 dim = isl_space_set_tuple_name(dim, isl_dim_set, u->statement->name);
2132 gen->stmt_domain = isl_union_set_extract_set(uset, dim);
2133 isl_union_set_free(uset);
2135 print_indent(code->dst, code->indent);
2136 print_stmt_body(gen, code->dst, stmt);
2138 isl_set_free(gen->stmt_domain);
2141 static void print_private_access(struct cuda_gen *gen,
2142 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
2143 const char *type, struct cuda_array_ref_group *group)
2145 const char *array_name;
2146 char *name;
2147 isl_ctx *ctx;
2148 unsigned nvar = isl_set_dim(access, isl_dim_set);
2149 isl_union_map *usched;
2151 if (isl_set_fast_is_empty(access)) {
2152 isl_set_free(access);
2153 return;
2156 ctx = isl_set_get_ctx(access);
2157 array_name = isl_set_get_tuple_name(access);
2158 name = isl_alloc_array(ctx, char,
2159 strlen(type) + sizeof("_private_") + strlen(array_name) + 20);
2160 if (group->array->n_group > 1)
2161 sprintf(name, "%s_private_%s_%d", type, array_name, group->nr);
2162 else
2163 sprintf(name, "%s_private_%s", type, array_name);
2164 access = isl_set_set_tuple_name(access, name);
2165 free(name);
2167 gen->copy_sched = shift_access(access, group);
2168 gen->copy_group = group;
2169 gen->copy_bound = group->private_bound;
2171 usched = isl_union_map_from_map(isl_map_copy(gen->copy_sched));
2172 print_shared_body(gen, shared_domain, usched, nvar,
2173 &print_copy_statement, 1);
2174 isl_union_map_free(usched);
2176 isl_map_free(gen->copy_sched);
2179 /* Print code for reading into or writing from private memory
2180 * the given array reference group.
2182 * sched maps the original iteration domains to the shared memory tile loops.
2184 static void print_group_private_accesses(struct cuda_gen *gen,
2185 struct cuda_array_ref_group *group,
2186 const char *type, __isl_keep isl_set *shared_domain,
2187 unsigned first_shared, int shared_len, __isl_keep isl_union_map *sched)
2189 int read;
2190 isl_union_map *access;
2191 isl_union_set *uset;
2192 isl_set *access_set;
2194 if (!group->private_bound)
2195 return;
2197 read = !strcmp(type, "read");
2199 access = group_access_relation(group, read, !read);
2200 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
2201 access = isl_union_map_intersect(access,
2202 isl_union_map_copy(gen->private_access));
2203 uset = isl_union_map_range(access);
2205 if (isl_union_set_is_empty(uset)) {
2206 isl_union_set_free(uset);
2207 return;
2210 access_set = isl_set_from_union_set(uset);
2211 access_set = isl_set_coalesce(access_set);
2212 access_set = isl_set_eliminate(access_set, isl_dim_param,
2213 first_shared + shared_len,
2214 gen->shared_len - shared_len);
2216 print_private_access(gen, shared_domain, access_set, type, group);
2219 /* Print code for reading into or writing from private memory at
2220 * the given level (-1 for innermost).
2222 * If we are not printing at the innermost level, then the dimensionality
2223 * of shared_domain may be smaller than gen->shared_len.
2224 * As the rest of the code assumes that the domain of access has
2225 * gen->shared_len dimensions, we therefore may need to embed this domain
2226 * in a higher dimensional space after intersection with shared_domain.
2228 * This code is very similar to print_shared_accesses.
2229 * The main difference is that we need to take into account gen->private_access.
2231 static void print_private_accesses(struct cuda_gen *gen,
2232 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
2233 const char *type, int level)
2235 int i, j;
2236 isl_space *dim;
2237 isl_map *proj;
2238 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
2239 unsigned first_shared;
2240 isl_union_map *sched;
2242 shared_domain = isl_set_copy(shared_domain);
2243 sched = isl_union_map_copy(gen->tiled_sched);
2244 dim = isl_union_map_get_space(sched);
2245 first_shared = isl_space_dim(dim, isl_dim_param);
2246 proj = projection(dim, gen->tiled_len, shared_len);
2247 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2248 sched = isl_union_map_intersect_range(sched,
2249 isl_union_set_from_set(isl_set_copy(shared_domain)));
2250 if (shared_len != gen->shared_len) {
2251 dim = isl_union_map_get_space(sched);
2252 proj = projection(dim, gen->shared_len, shared_len);
2253 proj = isl_map_reverse(proj);
2254 shared_domain = isl_set_apply(shared_domain,
2255 isl_map_copy(proj));
2256 sched = isl_union_map_apply_range(sched,
2257 isl_union_map_from_map(proj));
2260 for (i = 0; i < gen->n_array; ++i) {
2261 struct cuda_array_info *array = &gen->array[i];
2263 for (j = 0; j < array->n_group; ++j) {
2264 if (array->groups[j]->print_shared_level != level)
2265 continue;
2267 print_group_private_accesses(gen, array->groups[j],
2268 type, shared_domain,
2269 first_shared, shared_len, sched);
2273 isl_union_map_free(sched);
2274 isl_set_free(shared_domain);
2277 /* Set unroll[j] if the input dimension j is involved in
2278 * the index expression represented by bmap.
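* For example, if an index expression equals i2 + 2 * i5 in terms of the
* input (schedule) dimensions, then both unroll[2] and unroll[5] are set.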
2280 static int check_unroll(__isl_take isl_basic_map *bmap, void *user)
2282 int i, j;
2283 int n_in = isl_basic_map_dim(bmap, isl_dim_in);
2284 int n_out = isl_basic_map_dim(bmap, isl_dim_out);
2285 int *unroll = user;
2287 for (i = 0; i < n_out; ++i) {
2288 isl_constraint *c;
2289 int ok;
2291 ok = isl_basic_map_has_defining_equality(bmap,
2292 isl_dim_out, i, &c);
2293 assert(ok);
2294 for (j = 0; j < n_in; ++j)
2295 if (isl_constraint_involves_dims(c, isl_dim_in, j, 1))
2296 unroll[j] = 1;
2297 isl_constraint_free(c);
2300 isl_basic_map_free(bmap);
2301 return 0;
2304 /* Given an array pos mapping input dimensions to the corresponding
2305 * output dimension, construct the corresponding map.
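* For example, with len = 3 and pos = { 1, 2, 0 }, input dimension i is
* equated to output dimension pos[i], so the constructed map is
* { [i0, i1, i2] -> [i2, i0, i1] }.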
2307 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
2308 int *pos, int len)
2310 int i;
2311 isl_constraint *c;
2312 isl_basic_map *bmap;
2313 isl_local_space *ls;
2315 dim = isl_space_add_dims(dim, isl_dim_in, len);
2316 dim = isl_space_add_dims(dim, isl_dim_out, len);
2317 bmap = isl_basic_map_universe(isl_space_copy(dim));
2318 ls = isl_local_space_from_space(dim);
2320 for (i = 0; i < len; ++i) {
2321 c = isl_equality_alloc(isl_local_space_copy(ls));
2322 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
2323 isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i], 1);
2324 bmap = isl_basic_map_add_constraint(bmap, c);
2326 isl_local_space_free(ls);
2328 return isl_map_from_basic_map(bmap);
2331 /* Find all loops involved in any of the index expressions for any of
2332 * the private accesses, move them innermost and then mark them as
2333 * requiring unrolling by setting gen->first_unroll.
2334 * The loops involved should all be parallel because of the checks
2335 * we performed in check_private_group_access. Moving them innermost
2336 * is therefore a valid transformation.
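* For example, suppose that out of four thread tile dimensions only
* dimensions 1 and 3 appear in private index expressions; the permutation
* then moves dimensions 0 and 2 to the first two positions and dimensions
* 1 and 3 to the last two, so that the loops to be unrolled end up innermost.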
2338 static __isl_give isl_union_map *interchange_for_unroll(struct cuda_gen *gen,
2339 __isl_take isl_union_map *sched)
2341 int i, j;
2342 int unroll[gen->thread_tiled_len];
2343 int perm[gen->thread_tiled_len];
2344 isl_space *dim;
2345 isl_map *permute;
2346 int len = gen->shared_len + gen->n_parallel + gen->n_block;
2348 gen->first_unroll = -1;
2350 for (i = 0; i < gen->thread_tiled_len; ++i)
2351 unroll[i] = 0;
2352 for (i = 0; i < gen->n_array; ++i) {
2353 struct cuda_array_info *array = &gen->array[i];
2355 for (j = 0; j < array->n_group; ++j) {
2356 isl_union_map *access;
2357 isl_map *acc;
2359 if (!array->groups[j]->private_bound)
2360 continue;
2362 access = group_access_relation(array->groups[j], 1, 1);
2363 access = isl_union_map_apply_domain(access,
2364 isl_union_map_copy(sched));
2366 acc = isl_map_from_union_map(access);
2367 isl_map_foreach_basic_map(acc, &check_unroll, unroll);
2369 isl_map_free(acc);
2373 for (i = 0; i < gen->shared_len; ++i)
2374 if (unroll[i])
2375 return sched;
2377 for (i = gen->shared_len; i < len; ++i)
2378 if (unroll[i])
2379 break;
2381 if (i >= len)
2382 return sched;
2384 for (i = len; i < gen->thread_tiled_len; ++i)
2385 if (unroll[i])
2386 return sched;
2388 j = 0;
2389 for (i = 0; i < gen->thread_tiled_len; ++i)
2390 if (!unroll[i])
2391 perm[i] = j++;
2392 gen->first_unroll = 1 + j;
2393 for (i = 0; i < len; ++i)
2394 if (unroll[i])
2395 perm[i] = j++;
2397 dim = isl_union_map_get_space(sched);
2398 permute = permutation(dim, perm, gen->thread_tiled_len);
2399 sched = isl_union_map_apply_range(sched,
2400 isl_union_map_from_map(permute));
2402 return sched;
2405 /* This function is called for each leaf in the clast of the kernel code.
2406 * We first specialize the schedule to the site of the leaf and
2407 * print code for reading into shared memory, performing the actual
2408 * computations and writing from shared memory, with the required
2409 * synchronizations.
2411 static void print_kernel_user(struct clast_printer_info *code,
2412 struct clast_user_stmt *u)
2414 struct cuda_gen *gen = code->user;
2415 isl_set *shared_domain;
2417 shared_domain = extract_entire_host_domain(&u->stmt);
2419 print_shared_accesses(gen, shared_domain, gen->read, "read", -1);
2421 print_private_accesses(gen, shared_domain, gen->read, "read", -1);
2423 print_shared_body(gen, shared_domain, gen->local_sched,
2424 gen->thread_tiled_len, &print_statement,
2425 gen->first_unroll);
2427 print_private_accesses(gen, shared_domain, gen->write, "write", -1);
2429 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
2430 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
2432 print_shared_accesses(gen, shared_domain, gen->write, "write", -1);
2434 isl_set_free(shared_domain);
2437 /* Check if we need to perform any copying to shared memory at this level
2438 * and if so, print the copying instructions.
2439 * Any array for which we are allowed to print copying instructions at
2440 * this level, but haven't done so already, is printed.
2442 static void copy_to_local(struct cuda_gen *gen, __isl_keep isl_set *domain)
2444 int i, j;
2445 int level;
2446 int print = 0;
2448 level = isl_set_dim(domain, isl_dim_set);
2450 for (i = 0; i < gen->n_array; ++i) {
2451 struct cuda_array_info *array = &gen->array[i];
2453 for (j = 0; j < array->n_group; ++j) {
2454 if (array->groups[j]->print_shared_level >= 0)
2455 continue;
2456 if (array->groups[j]->last_shared >= level)
2457 continue;
2458 array->groups[j]->print_shared_level = level;
2459 print = 1;
2463 if (print) {
2464 print_shared_accesses(gen, domain, gen->read, "read", level);
2465 print_private_accesses(gen, domain, gen->read, "read", level);
2470 /* This function is called for each for loop in the clast,
2471 * right after the opening brace has been printed.
2473 * Print copying instructions to shared or private memory if needed.
2475 static void print_kernel_for_head(struct clast_printer_info *code,
2476 struct clast_for *f)
2478 struct cuda_gen *gen = code->user;
2479 isl_set *domain;
2481 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2482 copy_to_local(gen, domain);
2484 isl_set_free(domain);
2487 /* Print instructions for copying from shared memory for each array
2488 * for which print_kernel_for_head has added copying instructions
2489 * to shared memory.
2491 static void copy_from_local(struct cuda_gen *gen, __isl_keep isl_set *domain)
2493 int i, j;
2494 int level;
2495 int print = 0;
2497 level = isl_set_dim(domain, isl_dim_set);
2499 for (i = 0; i < gen->n_array; ++i) {
2500 struct cuda_array_info *array = &gen->array[i];
2502 for (j = 0; j < array->n_group; ++j) {
2503 if (array->groups[j]->print_shared_level != level)
2504 continue;
2505 print = 1;
2506 break;
2508 if (print)
2509 break;
2512 if (print) {
2513 print_private_accesses(gen, domain, gen->write, "write", level);
2514 print_shared_accesses(gen, domain, gen->write, "write", level);
2518 /* This function is called for each for loop in the clast,
2519 * right before the closing brace is printed.
2521 * Print copying instructions from shared or private memory if needed.
2523 static void print_kernel_for_foot(struct clast_printer_info *code,
2524 struct clast_for *f)
2526 struct cuda_gen *gen = code->user;
2527 isl_set *domain;
2529 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2530 copy_from_local(gen, domain);
2532 isl_set_free(domain);
2535 /* Use CLooG to generate code for the outer gen->shared_first loops
2536 * of the local schedule "sched".
2537 * The pretty printing of this code is handled by print_clast,
2538 * which calls print_kernel_user for each iteration of the shared tile loops.
2540 static void print_cloog_kernel_body(struct cuda_gen *gen,
2541 __isl_keep isl_set *context, __isl_keep isl_union_map *sched)
2543 int i;
2544 CloogOptions *options;
2545 CloogDomain *cloog_context;
2546 CloogUnionDomain *ud;
2547 CloogInput *input;
2548 struct clast_stmt *stmt;
2549 char name[20];
2551 sched = isl_union_map_copy(sched);
2552 sched = isl_union_map_align_params(sched, isl_set_get_space(context));
2554 options = cloog_options_malloc(gen->state);
2555 options->language = CLOOG_LANGUAGE_C;
2556 options->strides = 1;
2557 options->sh = 1;
2558 options->stop = gen->shared_len;
2559 options->f = gen->tiled_len;
2560 options->l = gen->tiled_len;
2561 options->save_domains = 1;
2562 options->noscalars = 1;
2564 ud = cloog_union_domain_from_isl_union_map(sched);
2565 for (i = 0; i < gen->shared_len; ++i) {
2566 snprintf(name, sizeof(name), "g%d", i);
2567 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
2569 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
2570 input = cloog_input_alloc(cloog_context, ud);
2572 stmt = cloog_clast_create_from_input(input, options);
2574 gen->kernel_code.indent = 4;
2575 gen->kernel_code.dst = gen->cuda.kernel_c;
2576 gen->kernel_code.print_user_stmt = NULL;
2577 gen->kernel_code.print_user_stmt_list = &print_kernel_user;
2578 gen->kernel_code.print_for_head = &print_kernel_for_head;
2579 gen->kernel_code.print_for_foot = &print_kernel_for_foot;
2580 gen->kernel_code.user = gen;
2581 copy_to_local(gen, context);
2582 print_clast(&gen->kernel_code, stmt);
2583 copy_from_local(gen, context);
2585 cloog_clast_free(stmt);
2586 cloog_options_free(options);
2589 static void print_kernel_iterators(struct cuda_gen *gen)
2591 int i;
2592 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
2593 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
2594 "threadIdx.z" };
2596 if (gen->n_grid > 0) {
2597 print_indent(gen->cuda.kernel_c, 4);
2598 fprintf(gen->cuda.kernel_c, "int ");
2599 for (i = 0; i < gen->n_grid; ++i) {
2600 if (i)
2601 fprintf(gen->cuda.kernel_c, ", ");
2602 fprintf(gen->cuda.kernel_c, "b%d = %s",
2603 i, block_dims[gen->n_grid - 1 - i]);
2605 fprintf(gen->cuda.kernel_c, ";\n");
2608 if (gen->n_block > 0) {
2609 print_indent(gen->cuda.kernel_c, 4);
2610 fprintf(gen->cuda.kernel_c, "int ");
2611 for (i = 0; i < gen->n_block; ++i) {
2612 if (i)
2613 fprintf(gen->cuda.kernel_c, ", ");
2614 fprintf(gen->cuda.kernel_c, "t%d = %s",
2615 i, thread_dims[gen->n_block - 1 - i]);
2617 fprintf(gen->cuda.kernel_c, ";\n");
2621 static void print_group_shared_array(struct cuda_gen *gen,
2622 struct cuda_array_ref_group *group)
2624 int j;
2625 struct cuda_array_bound *bounds;
2627 bounds = group->private_bound;
2628 if (!bounds)
2629 bounds = group->shared_bound;
2630 if (!bounds)
2631 return;
2633 print_indent(gen->cuda.kernel_c, 4);
2634 fprintf(gen->cuda.kernel_c, "%s%s ",
2635 group->private_bound ? "" : "__shared__ ", group->array->type);
2636 print_array_name(gen->cuda.kernel_c, group);
2637 for (j = 0; j < group->array->n_index; ++j) {
2638 fprintf(gen->cuda.kernel_c, "[");
2639 isl_int_print(gen->cuda.kernel_c, bounds[j].size, 0);
2640 fprintf(gen->cuda.kernel_c, "]");
2642 fprintf(gen->cuda.kernel_c, ";\n");
2645 static void print_shared_arrays(struct cuda_gen *gen)
2647 int i, j;
2649 for (i = 0; i < gen->n_array; ++i) {
2650 struct cuda_array_info *array = &gen->array[i];
2652 for (j = 0; j < array->n_group; ++j)
2653 print_group_shared_array(gen, array->groups[j]);
2657 static void print_kernel_body(struct cuda_gen *gen,
2658 __isl_keep isl_set *host_domain, __isl_keep isl_union_map *sched)
2660 isl_set *context;
2662 context = isl_set_copy(host_domain);
2663 context = parametrize(context, 0, gen->tile_first, "h");
2664 context = isl_set_project_out(context, isl_dim_set, 0, gen->tile_first);
2665 context = add_bounded_parameters(context,
2666 gen->n_grid, gen->grid_dim, "b");
2668 print_kernel_iterators(gen);
2669 print_shared_arrays(gen);
2671 fprintf(gen->cuda.kernel_c, "\n");
2673 print_cloog_kernel_body(gen, context, sched);
2675 isl_set_free(context);
2678 /* Given a constraint
2680 * a(p,i) + j = g f(e)
2682 * or -a(p,i) - j = g f(e) if sign < 0,
2683 * store a(p,i) in bound->shift and g (stride) in bound->stride.
2684 * a(p,i) is assumed to be an expression in only the parameters.
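* For example, the constraint n + j = 4 alpha, with alpha existentially
* quantified, yields stride 4 and shift n.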
2686 static void extract_stride(__isl_keep isl_constraint *c,
2687 struct cuda_array_bound *bound, isl_int stride, int sign)
2689 int i;
2690 isl_int v;
2691 isl_space *dim;
2692 unsigned nparam;
2693 isl_aff *aff;
2695 isl_int_set(bound->stride, stride);
2697 dim = isl_constraint_get_space(c);
2698 dim = isl_space_params(dim);
2700 nparam = isl_space_dim(dim, isl_dim_param);
2702 isl_int_init(v);
2704 isl_constraint_get_constant(c, &v);
2705 if (sign < 0)
2706 isl_int_neg(v, v);
2707 aff = isl_aff_zero_on_domain(isl_local_space_from_space(dim));
2708 aff = isl_aff_set_constant(aff, v);
2710 for (i = 0; i < nparam; ++i) {
2711 isl_constraint_get_coefficient(c, isl_dim_param, i, &v);
2712 if (isl_int_is_zero(v))
2713 continue;
2714 if (sign < 0)
2715 isl_int_neg(v, v);
2716 aff = isl_aff_add_coefficient(aff, isl_dim_param, i, v);
2719 isl_int_clear(v);
2721 bound->shift = aff;
2724 /* Given an equality constraint of a map with a single output dimension j,
2725 * check if the constraint is of the form
2727 * a(p,i) + j = g f(e)
2729 * with a(p,i) an expression in the parameters and input dimensions
2730 * and f(e) an expression in the existentially quantified variables.
2731 * If so, and if g is larger than any such g from a previously considered
2732 * constraint, then call extract_stride to record the stride information
2733 * in bound.
2735 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
2737 int i;
2738 isl_int v, stride;
2739 unsigned n_div;
2740 struct cuda_array_bound *bound = user;
2742 isl_int_init(v);
2743 isl_int_init(stride);
2745 n_div = isl_constraint_dim(c, isl_dim_div);
2746 isl_constraint_get_coefficient(c, isl_dim_out, 0, &v);
2748 if (n_div && (isl_int_is_one(v) || isl_int_is_negone(v))) {
2749 int s = isl_int_sgn(v);
2750 isl_int_set_si(stride, 0);
2751 for (i = 0; i < n_div; ++i) {
2752 isl_constraint_get_coefficient(c, isl_dim_div, i, &v);
2753 isl_int_gcd(stride, stride, v);
2755 if (!isl_int_is_zero(stride) &&
2756 isl_int_gt(stride, bound->stride))
2757 extract_stride(c, bound, stride, s);
2760 isl_int_clear(stride);
2761 isl_int_clear(v);
2763 isl_constraint_free(c);
2764 return 0;
2767 /* Given constraints on an array index i, check if we can find
2768 * a shift a(p) and a stride g such that
2770 * a(p) + i = 0 mod g
2772 * If so, record the information in bound and apply the mapping
2773 * i -> (i + a(p))/g to the array index in bounds and return
2774 * the new constraints.
2775 * If not, simply return the original constraints.
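* For example, if the accessed values of i always satisfy i = 2 alpha - n
* for some integer alpha, then a(p) = n and g = 2, and i is replaced by
* (i + n)/2, so that the elements of the shared copy are contiguous.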
2777 static __isl_give isl_basic_map *check_stride(struct cuda_gen *gen,
2778 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2780 isl_basic_map *aff;
2781 isl_basic_map *shift;
2782 isl_aff *aff_shift;
2784 isl_int_set_si(bound->stride, -1);
2786 aff = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
2788 isl_basic_map_foreach_constraint(aff, &check_stride_constraint, bound);
2790 isl_basic_map_free(aff);
2792 if (isl_int_is_neg(bound->stride))
2793 return bounds;
2795 aff_shift = isl_aff_copy(bound->shift);
2796 aff_shift = isl_aff_add_dims(aff_shift, isl_dim_in, 1);
2797 aff_shift = isl_aff_add_coefficient_si(aff_shift, isl_dim_in, 0, 1);
2798 aff_shift = isl_aff_scale_down(aff_shift, bound->stride);
2799 shift = isl_basic_map_from_aff(aff_shift);
2801 bound->shift_map = isl_basic_map_copy(shift);
2802 bounds = isl_basic_map_apply_range(bounds, shift);
2804 return bounds;
2807 struct cuda_size_info {
2808 isl_basic_set *bset;
2809 struct cuda_array_bound *bound;
2810 int pos;
2813 /* Given a constraint from the basic set describing the bounds on
2814 * an array index, check if it is a lower bound, say m i >= b(x), and,
2815 * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
2816 * upper bound. If so, and if this bound is smaller than any bound
2817 * derived from earlier constraints, set the size to this bound on
2818 * the expression and the lower bound to ceil(b(x)/m).
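* For example, if the accessed index satisfies n <= i <= n + 31, then the
* lower bound i >= n gives ceil(b(x)/m) = n, the expression i - n + 1 has
* constant upper bound 32, and the size in this direction is therefore 32
* with lower bound n.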
2820 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
2822 struct cuda_size_info *size = user;
2823 unsigned nparam;
2824 unsigned n_div;
2825 isl_int v;
2827 nparam = isl_basic_set_dim(size->bset, isl_dim_param);
2828 n_div = isl_constraint_dim(c, isl_dim_div);
2830 if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div)) {
2831 isl_constraint_free(c);
2832 return 0;
2835 isl_int_init(v);
2837 isl_constraint_get_coefficient(c, isl_dim_set, size->pos, &v);
2839 if (isl_int_is_pos(v)) {
2840 isl_aff *aff;
2841 isl_aff *lb;
2842 enum isl_lp_result res;
2844 aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
2845 aff = isl_aff_ceil(aff);
2847 lb = isl_aff_copy(aff);
2849 aff = isl_aff_neg(aff);
2850 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
2852 res = isl_basic_set_max(size->bset, aff, &v);
2853 isl_aff_free(aff);
2855 if (res == isl_lp_ok) {
2856 isl_int_add_ui(v, v, 1);
2857 if (isl_int_is_neg(size->bound->size) ||
2858 isl_int_lt(v, size->bound->size)) {
2859 isl_int_set(size->bound->size, v);
2860 lb = isl_aff_drop_dims(lb, isl_dim_in,
2861 0, size->pos + 1);
2862 isl_aff_free(size->bound->lb);
2863 size->bound->lb = isl_aff_copy(lb);
2866 isl_aff_free(lb);
2869 isl_int_clear(v);
2870 isl_constraint_free(c);
2872 return 0;
2875 /* Given a basic map "bounds" that maps parameters and input dimensions
2876 * to a single output dimension, look for an expression in the parameters
2877 * and input dimensions such that the range of the output dimension shifted
2878 * by this expression is a constant.
2880 * In particular, we currently only consider lower bounds on the output
2881 * dimension as candidate expressions.
2883 static int compute_array_dim_size(struct cuda_gen *gen,
2884 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2886 struct cuda_size_info size;
2888 bounds = isl_basic_map_detect_equalities(bounds);
2889 bounds = check_stride(gen, bound, bounds);
2891 isl_int_set_si(bound->size, -1);
2892 bound->lb = NULL;
2894 size.bound = bound;
2895 size.pos = isl_basic_map_dim(bounds, isl_dim_in);
2896 size.bset = isl_basic_map_wrap(bounds);
2897 size.bset = isl_basic_set_flatten(size.bset);
2898 size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
2899 isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
2900 &size);
2901 isl_basic_set_free(size.bset);
2903 return isl_int_is_nonneg(bound->size) ? 0 : -1;
2906 /* Check if we can find a shared memory tile for the given array
2907 * based on the given accesses, and if so, put the results
2908 * in array->shared_bound.
2910 * We project the accesses on each index in turn and look for a parametric
2911 * offset such that the size is constant.
2913 static int can_tile_for_shared_memory(struct cuda_gen *gen,
2914 struct cuda_array_info *array, __isl_keep isl_map *access,
2915 struct cuda_array_bound *bounds)
2917 int i;
2919 for (i = 0; i < array->n_index; ++i) {
2920 isl_map *access_i;
2921 isl_basic_map *hull;
2923 access_i = isl_map_copy(access);
2924 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
2925 access_i = isl_map_project_out(access_i, isl_dim_out,
2926 1, array->n_index - (i + 1));
2927 access_i = isl_map_compute_divs(access_i);
2928 hull = isl_map_simple_hull(access_i);
2929 if (compute_array_dim_size(gen, &bounds[i], hull) < 0)
2930 return 0;
2933 return 1;
2936 /* Construct a map that takes as input the shared tile loops and the loops
2937 * that will be wrapped around the threads, relates these latter loops
2938 * to the thread indices and then projects them out.
2940 static __isl_give isl_map *compute_privatization(struct cuda_gen *gen)
2942 isl_map *priv;
2943 isl_map *tiling;
2944 isl_map *proj;
2945 isl_set *par;
2946 isl_space *dim;
2948 dim = isl_union_map_get_space(gen->shared_sched);
2950 if (gen->options->wrap)
2951 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
2952 gen->shared_len, gen->n_block, gen->block_dim);
2953 else
2954 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
2955 gen->shared_len, gen->n_block, gen->block_dim);
2957 priv = tiling;
2959 par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2960 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2961 gen->n_block, "t");
2963 priv = isl_map_align_params(priv, isl_set_get_space(par));
2964 priv = isl_map_intersect_range(priv, par);
2966 dim = isl_map_get_space(priv);
2967 dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2968 dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2969 proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2970 gen->shared_len);
2972 priv = isl_map_apply_range(priv, proj);
2974 return priv;
2977 /* Construct a map from domain_dim to domain_dim that increments
2978 * the dimension at position "pos" and leaves all other dimensions
2979 * constant.
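* For example, on a three-dimensional space, next(dim, 1) constructs the map
* { [i0, i1, i2] -> [i0, i1 + 1, i2] }.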
2981 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2983 int i;
2984 int len = isl_space_dim(domain_dim, isl_dim_set);
2985 isl_space *dim;
2986 isl_basic_map *next;
2987 isl_local_space *ls;
2989 dim = isl_space_map_from_set(domain_dim);
2990 next = isl_basic_map_universe(isl_space_copy(dim));
2991 ls = isl_local_space_from_space(dim);
2993 for (i = 0; i < len; ++i) {
2994 isl_constraint *c;
2996 c = isl_equality_alloc(isl_local_space_copy(ls));
2997 isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2998 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2999 if (i == pos)
3000 isl_constraint_set_constant_si(c, 1);
3001 next = isl_basic_map_add_constraint(next, c);
3004 isl_local_space_free(ls);
3006 return isl_map_from_basic_map(next);
3009 /* Check if the given access is coalesced.
3010 * That is, check whether incrementing the dimension that will get
3011 * wrapped over the last thread index results in incrementing
3012 * the last array index.
3014 * This function is only called for access relations without reuse.
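* For example, an access A[i][t], where t is the schedule dimension that will
* be wrapped over threadIdx.x, is coalesced, since incrementing t increments
* the last array index; an access A[t][i] is not, since incrementing t then
* changes the outer index instead.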
3016 static int access_is_coalesced(struct cuda_gen *gen,
3017 __isl_keep isl_union_map *access)
3019 isl_space *dim;
3020 isl_map *access_map;
3021 isl_map *next_thread_x;
3022 isl_map *next_element;
3023 isl_map *map;
3024 int coalesced;
3026 access = isl_union_map_copy(access);
3027 access = isl_union_map_apply_domain(access,
3028 isl_union_map_copy(gen->tiled_sched));
3029 access_map = isl_map_from_union_map(access);
3031 dim = isl_map_get_space(access_map);
3032 dim = isl_space_domain(dim);
3033 next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
3035 dim = isl_map_get_space(access_map);
3036 dim = isl_space_range(dim);
3037 next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
3039 map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
3040 map = isl_map_apply_range(map, access_map);
3042 coalesced = isl_map_is_subset(map, next_element);
3044 isl_map_free(next_element);
3045 isl_map_free(map);
3047 return coalesced;
3050 /* For the given array reference group, check whether the access is private
3051 * to the thread. That is, check that any given array element
3052 * is only accessed by a single thread.
3053 * We compute an access relation that maps the shared tile loop iterators
3054 * and the shared point loop iterators that will be wrapped over the
3055 * threads to the array elements.
3056 * We actually check that those iterators that will be wrapped
3057 * partition the array space. This check is stricter than necessary
3058 * since several iterations may be mapped onto the same thread
3059 * and then they could be allowed to access the same memory elements,
3060 * but our check does not allow this situation.
3062 * We also check that the index expression only depends on parallel
3063 * loops. That way, we can move those loops innermost and unroll them.
3064 * Again, we use a test that is stricter than necessary.
3065 * We actually check whether the index expression only depends
3066 * on the iterators that are wrapped over the threads.
3067 * These are necessarily parallel, but there may be more parallel loops.
3069 * Combining the injectivity of the first test with the single-valuedness
3070 * of the second test, we simply test for bijectivity.
3072 * If it turns out we can use registers, we compute the private memory
3073 * tile size using can_tile_for_shared_memory, after introducing a dependence
3074 * on the thread indices.
3076 * Before performing any of the above computations, we first check
3077 * if there is any reuse on the reference group. If not, we simply
3078 * return. If, moreover, the access is coalesced then we also remove
3079 * the shared memory tiling since we should just use global memory instead.
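* For example, an access A[t], with t wrapped over the threads, maps each
* thread to its own element and only depends on a wrapped iterator, so the
* corresponding map is bijective and A can be placed in registers.
* An access A[0], performed by every thread, is not injective and is left
* in shared or global memory.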
3081 static void check_private_group_access(struct cuda_gen *gen,
3082 struct cuda_array_ref_group *group)
3084 isl_map *acc;
3085 isl_union_map *access;
3086 int n_index = group->array->n_index;
3088 access = group_access_relation(group, 1, 1);
3089 if (isl_union_map_is_injective(access)) {
3090 if (group->shared_bound && access_is_coalesced(gen, access)) {
3091 free_bound_list(group->shared_bound, n_index);
3092 group->shared_bound = NULL;
3094 isl_union_map_free(access);
3095 return;
3097 access = isl_union_map_apply_domain(access,
3098 isl_union_map_copy(gen->shared_sched));
3100 acc = isl_map_from_union_map(access);
3102 if (!isl_map_is_bijective(acc)) {
3103 isl_map_free(acc);
3104 return;
3107 group->private_bound = create_bound_list(gen->ctx, n_index);
3108 acc = isl_map_align_params(acc, isl_map_get_space(gen->privatization));
3109 acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
3110 if (!can_tile_for_shared_memory(gen, group->array, acc,
3111 group->private_bound)) {
3112 free_bound_list(group->private_bound, n_index);
3113 group->private_bound = NULL;
3116 isl_map_free(acc);
3119 /* Look for the last shared tile loop that affects the offset of the
3120 * shared or private tile and store the result in array->last_shared.
3122 static void set_last_shared(struct cuda_gen *gen,
3123 struct cuda_array_ref_group *group)
3125 int i, j;
3126 struct cuda_array_bound *bounds;
3127 unsigned first_shared = gen->first_shared;
3128 int n_index = group->array->n_index;
3130 bounds = group->private_bound;
3131 if (!bounds)
3132 bounds = group->shared_bound;
3133 if (!bounds)
3134 return;
3136 for (j = gen->shared_len - 1; j >= 0; --j) {
3137 for (i = 0; i < n_index; ++i) {
3138 isl_aff *lb;
3139 isl_aff *shift;
3141 lb = bounds[i].lb;
3142 if (isl_aff_involves_dims(lb, isl_dim_param,
3143 first_shared + j, 1))
3144 break;
3146 shift = bounds[i].shift;
3147 if (!shift)
3148 continue;
3149 if (isl_aff_involves_dims(shift, isl_dim_param,
3150 first_shared + j, 1))
3151 break;
3153 if (i < n_index)
3154 break;
3156 group->last_shared = j;
3159 /* Compute the sizes of all private arrays for the current kernel,
3160 * as well as the offsets of the private pieces in the original arrays.
3161 * If we cannot or don't want to privatize a given array group,
3162 * we use the shared memory tile sizes computed in
3163 * compute_group_shared_bound instead.
3165 * If we have been able to find a private or shared tile,
3166 * we also look for the last shared tile loop that affects the offset
3167 * (and therefore the group tile) and store the result in group->last_shared.
3169 * A privatized copy of all access relations from reference groups that
3170 * are mapped to private memory is stored in gen->private_access.
3172 static void compute_private_size(struct cuda_gen *gen)
3174 int i, j;
3175 isl_union_map *private;
3177 if (!gen->options->use_private_memory)
3178 return;
3180 private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
3182 for (i = 0; i < gen->n_array; ++i) {
3183 struct cuda_array_info *array = &gen->array[i];
3185 for (j = 0; j < array->n_group; ++j) {
3186 check_private_group_access(gen, array->groups[j]);
3188 if (!array->groups[j]->private_bound)
3189 continue;
3191 private = isl_union_map_union(private,
3192 group_access_relation(array->groups[j], 1, 1));
3195 for (j = 0; j < array->n_group; ++j) {
3196 array->groups[j]->last_shared = gen->shared_len - 1;
3197 array->groups[j]->print_shared_level = -1;
3198 set_last_shared(gen, array->groups[j]);
3202 if (isl_union_map_is_empty(private))
3203 isl_union_map_free(private);
3204 else {
3205 isl_union_map *priv;
3207 private = isl_union_map_apply_domain(private,
3208 isl_union_map_copy(gen->shared_sched));
3209 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
3210 private = isl_union_map_apply_domain(private, priv);
3211 gen->private_access = private;
3215 /* Compute the size of the tile specified by the list "bound" of n_index
3216 * cuda_array_bounds in number of elements and put the result in *size.
3218 static void tile_size(unsigned n_index, struct cuda_array_bound *bound,
3219 isl_int *size)
3221 int i;
3223 isl_int_set_si(*size, 1);
3225 for (i = 0; i < n_index; ++i)
3226 isl_int_mul(*size, *size, bound[i].size);
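/* A minimal illustrative sketch (not used by ppcg itself) of the same
 * computation as tile_size() above, using a plain integer type instead of
 * isl_int and assuming the product of the per-dimension sizes does not
 * overflow a long.
 */
static long tile_size_elements(unsigned n_index, const long *dim_size)
{
	long size = 1;
	unsigned i;

	/* The number of elements is the product of the sizes
	 * in each dimension.
	 */
	for (i = 0; i < n_index; ++i)
		size *= dim_size[i];

	return size;
}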
3229 /* If max_shared_memory is not set to infinity (-1), then make
3230 * sure that the total amount of shared memory required by the
3231 * array reference groups mapped to shared memory is no larger
3232 * than this maximum.
3234 * We apply a greedy approach and discard (keep in global memory)
3235 * those groups that would result in a total memory size that
3236 * is larger than the maximum.
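* For example, with 48KB of shared memory available and groups whose tiles
* require 40KB, 12KB and 4KB, considered in that order, the first and third
* tiles are kept in shared memory, while the 12KB tile no longer fits in the
* remaining 8KB and its group is accessed from global memory instead.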
3238 static void check_shared_memory_bound(struct cuda_gen *gen)
3240 int i, j;
3241 isl_int left, size;
3243 if (gen->options->max_shared_memory < 0)
3244 return;
3246 isl_int_init(left);
3247 isl_int_init(size);
3248 isl_int_set_si(left, gen->options->max_shared_memory);
3250 for (i = 0; i < gen->n_array; ++i) {
3251 struct cuda_array_info *array = &gen->array[i];
3253 for (j = 0; j < array->n_group; ++j) {
3254 struct cuda_array_ref_group *group;
3256 group = array->groups[j];
3257 if (!group->shared_bound)
3258 continue;
3260 tile_size(array->n_index, group->shared_bound, &size);
3261 isl_int_mul_ui(size, size, array->size);
3263 if (isl_int_le(size, left)) {
3264 isl_int_sub(left, left, size);
3265 continue;
3268 free_bound_list(group->shared_bound, array->n_index);
3269 group->shared_bound = NULL;
3273 isl_int_clear(size);
3274 isl_int_clear(left);
3277 /* Fill up the groups array with singleton groups, i.e., one group
3278 * per reference, initializing the array, access, write and refs fields.
3279 * In particular the access field is initialized to the scheduled
3280 * access relation of the array reference.
3282 * Return the number of elements initialized, i.e., the number of
3283 * active references in the current kernel.
3285 static int populate_array_references(struct cuda_gen *gen,
3286 struct cuda_array_info *array, __isl_keep isl_union_map *sched,
3287 struct cuda_array_ref_group **groups)
3289 int i;
3290 int n;
3291 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3293 n = 0;
3294 for (i = 0; i < array->n_ref; ++i) {
3295 isl_union_map *umap;
3296 isl_map *map;
3297 struct cuda_array_ref_group *group;
3298 struct cuda_stmt_access *access = array->refs[i];
3300 map = isl_map_copy(access->access);
3301 umap = isl_union_map_from_map(map);
3302 umap = isl_union_map_apply_domain(umap,
3303 isl_union_map_copy(sched));
3305 if (isl_union_map_is_empty(umap)) {
3306 isl_union_map_free(umap);
3307 continue;
3310 map = isl_map_from_union_map(umap);
3311 map = isl_map_detect_equalities(map);
3313 group = isl_calloc_type(ctx, struct cuda_array_ref_group);
3314 assert(group);
3315 group->array = array;
3316 group->access = map;
3317 group->write = access->write;
3318 group->refs = &array->refs[i];
3320 groups[n++] = group;
3323 return n;
3326 static void free_array_ref_group(struct cuda_array_ref_group *group,
3327 int n_index)
3329 if (!group)
3330 return;
3331 free_bound_list(group->shared_bound, n_index);
3332 free_bound_list(group->private_bound, n_index);
3333 isl_map_free(group->access);
3334 free(group->refs);
3335 free(group);
3338 /* Given a set where the parameters gen->first_shared up to
3339 * gen->first_shared + gen->shared_len represent the tile loops,
3340 * eliminate the innermost of those that have a fixed value
3341 * until we reach one that does not (obviously) have a fixed value.
3343 static __isl_give isl_set *eliminate_fixed_inner_loops(struct cuda_gen *gen,
3344 __isl_take isl_set *access)
3346 int i;
3348 for (i = gen->shared_len - 1; i >= 0; --i) {
3349 int pos = gen->first_shared + i;
3350 if (!isl_set_plain_is_fixed(access, isl_dim_param, pos, NULL))
3351 break;
3352 access = isl_set_eliminate(access, isl_dim_param, pos, 1);
3354 return access;
3357 /* Check if the accessed set of group1 and group2 overlap within
3358 * the innermost loop. In particular, ignore any inner dimension
3359 * with a fixed value.
3360 * The copying to and from shared memory will be performed within
3361 * the innermost actual loop so we are only allowed to consider
3362 * the dimensions up to that innermost loop while checking whether
3363 * two access sets overlap.
3365 static int accesses_overlap(struct cuda_gen *gen,
3366 struct cuda_array_ref_group *group1,
3367 struct cuda_array_ref_group *group2)
3369 int empty;
3370 isl_set *access1, *access2;
3372 access1 = isl_map_range(isl_map_copy(group1->access));
3373 access1 = eliminate_fixed_inner_loops(gen, access1);
3374 access2 = isl_map_range(isl_map_copy(group2->access));
3375 access2 = eliminate_fixed_inner_loops(gen, access2);
3376 access1 = isl_set_intersect(access1, access2);
3377 empty = isl_set_is_empty(access1);
3378 isl_set_free(access1);
3380 return !empty;
3383 /* If two groups have overlapping access relations (within the innermost
3384 * loop) and if one of them involves a write, then merge the two groups
3385 * into one.
3387 * We keep track of the grouping in "leader". leader[j] points to
3388 * an earlier group array element that belongs to the same group,
3389 * or the array element j itself if this element is the first in the group.
3391 * Return the number of group leaders.
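* For example, with three references where only reference 0 is a write and
* only references 0 and 2 overlap, the resulting leader array is { 0, 1, 0 }
* and two group leaders remain.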
3393 static int group_overlapping_writes(struct cuda_gen *gen, int n,
3394 struct cuda_array_ref_group **groups, int *leader)
3396 int i, j;
3397 int n_group = n;
3399 for (i = 0; i < n; ++i) {
3400 int l = i;
3401 groups[l]->n_ref = 1;
3402 for (j = i - 1; j >= 0; --j) {
3403 if (leader[j] != j)
3404 continue;
3405 if (!groups[l]->write && !groups[j]->write)
3406 continue;
3408 if (!accesses_overlap(gen, groups[l], groups[j]))
3409 continue;
3411 groups[j]->access = isl_map_union(groups[j]->access,
3412 groups[l]->access);
3413 groups[j]->write = 1;
3414 groups[l]->access = NULL;
3415 groups[j]->n_ref += groups[l]->n_ref;
3416 l = leader[l] = j;
3417 n_group--;
3419 leader[i] = l;
3422 return n_group;
3425 /* Compute the size of the shared array corresponding to the given
3426 * array reference group, based on the accesses from the current kernel,
3427 * as well as the offset of the shared piece in the original array.
3429 static void compute_group_shared_bound(struct cuda_gen *gen,
3430 struct cuda_array_info *array, struct cuda_array_ref_group *group)
3432 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3434 if (!gen->options->use_shared_memory)
3435 return;
3436 if (cuda_array_is_read_only_scalar(array))
3437 return;
3439 group->shared_bound = create_bound_list(ctx, array->n_index);
3440 if (!can_tile_for_shared_memory(gen, array, group->access,
3441 group->shared_bound)) {
3442 free_bound_list(group->shared_bound, array->n_index);
3443 group->shared_bound = NULL;
3447 /* Is the size of the tile specified by "bound" smaller than the sum of
3448 * the sizes of the tiles specified by "bound1" and "bound2"?
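* For example, a merged 32 x 32 tile of 1024 elements is not smaller than
* the sum of a 32 x 4 tile and a 4 x 32 tile (128 + 128 = 256 elements),
* so group_common_shared_memory_tile would keep such groups separate.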
3450 static int smaller_tile(unsigned n_index, struct cuda_array_bound *bound,
3451 struct cuda_array_bound *bound1, struct cuda_array_bound *bound2)
3453 int smaller;
3454 isl_int size, size1, size2;
3456 isl_int_init(size);
3457 isl_int_init(size1);
3458 isl_int_init(size2);
3460 tile_size(n_index, bound, &size);
3461 tile_size(n_index, bound1, &size1);
3462 tile_size(n_index, bound2, &size2);
3464 isl_int_sub(size, size, size1);
3465 isl_int_sub(size, size, size2);
3466 smaller = isl_int_is_neg(size);
3468 isl_int_clear(size2);
3469 isl_int_clear(size1);
3470 isl_int_clear(size);
3472 return smaller;
3475 /* Given an initial grouping of array references and shared memory tiles
3476 * for each group that allows for a shared memory tile, merge two groups
3477 * if both have a shared memory tile, the merged group also has
3478 * a shared memory tile and the size of the tile for the merged group
3479 * is smaller than the sum of the tile sizes of the individual groups.
3481 * Return the number of group leaders after merging.
3483 static int group_common_shared_memory_tile(struct cuda_gen *gen,
3484 struct cuda_array_info *array, int n,
3485 struct cuda_array_ref_group **groups, int *leader, int n_group)
3487 int i, j;
3488 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3490 for (i = 0; n_group > 1 && i < n; ++i) {
3491 int l = i;
3492 if (leader[i] != i)
3493 continue;
3494 if (!groups[i]->shared_bound)
3495 continue;
3496 for (j = i - 1; j >= 0; --j) {
3497 isl_map *map;
3498 int empty;
3499 struct cuda_array_bound *shared_bound;
3501 if (leader[j] != j)
3502 continue;
3503 if (!groups[j]->shared_bound)
3504 continue;
3506 map = isl_map_intersect(isl_map_copy(groups[l]->access),
3507 isl_map_copy(groups[j]->access));
3508 empty = isl_map_is_empty(map);
3509 isl_map_free(map);
3511 if (empty)
3512 continue;
3514 map = isl_map_union(isl_map_copy(groups[l]->access),
3515 isl_map_copy(groups[j]->access));
3516 shared_bound = create_bound_list(ctx, array->n_index);
3517 if (!can_tile_for_shared_memory(gen, array, map,
3518 shared_bound) ||
3519 !smaller_tile(array->n_index, shared_bound,
3520 groups[l]->shared_bound,
3521 groups[j]->shared_bound)) {
3522 isl_map_free(map);
3523 free_bound_list(shared_bound, array->n_index);
3524 continue;
3527 free_bound_list(groups[j]->shared_bound,
3528 array->n_index);
3529 groups[j]->shared_bound = shared_bound;
3530 isl_map_free(groups[j]->access);
3531 groups[j]->access = map;
3532 groups[j]->n_ref += groups[l]->n_ref;
3533 l = leader[l] = j;
3534 n_group--;
3538 return n_group;
3541 /* Extract an array of array reference groups from the array of references
3542 * and the grouping information in "leader".
3544 * Store the results in array->n_group and array->groups.
3546 static void extract_array_groups(isl_ctx *ctx, struct cuda_array_info *array,
3547 int n, struct cuda_array_ref_group **groups, int *leader, int n_group)
3549 int i, j;
3551 for (i = 0; i < n; ++i)
3552 leader[i] = leader[leader[i]];
3554 array->n_group = n_group;
3555 array->groups = isl_alloc_array(ctx, struct cuda_array_ref_group *,
3556 n_group);
3557 assert(array->groups);
3559 j = 0;
3560 for (i = 0; i < n; ++i) {
3561 int k, l;
3562 struct cuda_stmt_access **refs;
3564 if (leader[i] != i) {
3565 groups[i]->refs = NULL;
3566 free_array_ref_group(groups[i], array->n_index);
3567 continue;
3570 refs = isl_alloc_array(ctx, struct cuda_stmt_access *,
3571 groups[i]->n_ref);
3572 assert(refs);
3573 l = 0;
3574 for (k = i; k < n; ++k)
3575 if (leader[k] == i) {
3576 refs[l++] = *groups[k]->refs;
3577 (*groups[k]->refs)->group = j;
3580 groups[i]->refs = refs;
3581 groups[i]->nr = j;
3582 array->groups[j++] = groups[i];
3586 /* Group array references that should be considered together when
3587 * deciding whether to access them from private, shared or global memory.
3589 * In particular, if two array references overlap and if one of them
3590 * is a write, then the two references are grouped together.
3591 * Furthermore, if two groups admit a shared memory tile and if the
3592 * combination of the two also admits a shared memory tile, we merge
3593 * the two groups.
3595 * During the construction the group->refs field points to a single
3596 * array reference inside the array of array references, while
3597 * group->n_ref contains the number of elements in leader that
3598 * (directly or indirectly) point to this group, provided the group
3599 * is a leader.
3601 static void group_array_references(struct cuda_gen *gen,
3602 struct cuda_array_info *array, __isl_keep isl_union_map *sched)
3604 int i;
3605 int n, n_group;
3606 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3607 struct cuda_array_ref_group **groups;
3608 int *leader;
3610 groups = isl_calloc_array(ctx, struct cuda_array_ref_group *,
3611 array->n_ref);
3612 assert(groups);
3614 n = populate_array_references(gen, array, sched, groups);
3616 leader = isl_alloc_array(ctx, int, n);
3617 assert(leader);
3619 n_group = group_overlapping_writes(gen, n, groups, leader);
3621 for (i = 0; i < n; ++i)
3622 if (leader[i] == i)
3623 compute_group_shared_bound(gen, array, groups[i]);
3625 n_group = group_common_shared_memory_tile(gen, array, n, groups,
3626 leader, n_group);
3628 extract_array_groups(ctx, array, n, groups, leader, n_group);
3630 free(leader);
3631 free(groups);
3634 /* Take tiled_sched, project it onto the shared tile loops and
3635 * the loops that will be wrapped over the threads,
3636 * parametrize the shared tile loops and store the result in gen->shared_sched.
3637 * The position of the first of these parameters is stored in gen->first_shared.
3638 * Also compute a projection that projects out the loops that will be
3639 * wrapped over the threads and store this projection in gen->shared_proj.
3641 static void compute_shared_sched(struct cuda_gen *gen)
3643 isl_space *dim;
3644 isl_map *proj;
3645 isl_set *par;
3646 isl_union_map *sched;
3648 sched = isl_union_map_copy(gen->tiled_sched);
3650 dim = isl_union_map_get_space(sched);
3651 gen->first_shared = isl_space_dim(dim, isl_dim_param);
3652 proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
3653 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3655 dim = isl_union_map_get_space(sched);
3656 par = parametrization(dim, gen->shared_len + gen->n_block,
3657 0, gen->shared_len, "g");
3658 sched = isl_union_map_intersect_range(sched,
3659 isl_union_set_from_set(par));
3661 dim = isl_union_map_get_space(sched);
3662 proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
3664 gen->shared_sched = sched;
3665 gen->shared_proj = isl_union_map_from_map(proj);
3668 /* Group references of all arrays in the program.
3670 static void group_references(struct cuda_gen *gen)
3672 int i;
3673 isl_union_map *sched;
3675 sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3676 isl_union_map_copy(gen->shared_proj));
3678 for (i = 0; i < gen->n_array; ++i)
3679 group_array_references(gen, &gen->array[i], sched);
3681 isl_union_map_free(sched);
3684 /* Free all array information that is local to the current kernel.
3686 static void free_local_array_info(struct cuda_gen *gen)
3688 int i, j;
3690 for (i = 0; i < gen->n_array; ++i) {
3691 struct cuda_array_info *array = &gen->array[i];
3693 for (j = 0; j < array->n_group; ++j)
3694 free_array_ref_group(array->groups[j], array->n_index);
3695 free(array->groups);
3697 if (array->n_group == 0)
3698 continue;
3699 for (j = 0; j < gen->array[i].n_index; ++j) {
3700 isl_pw_aff_free(gen->array[i].local_bound[j]);
3701 gen->array[i].local_bound[j] = NULL;
3706 /* The sizes of the arrays on the host that have been computed by
3707 * extract_array_info may depend on the parameters. Use the extra
3708 * constraints on the parameters that are valid at "host_domain"
3709 * to simplify these expressions.
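* For example, a size expression that distinguishes the cases n >= 64 and
* n < 64 collapses to a single expression when the constraints valid at
* "host_domain" imply n >= 64.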
3711 static void localize_bounds(struct cuda_gen *gen,
3712 __isl_keep isl_set *host_domain)
3714 int i, j;
3715 isl_set *context;
3717 context = isl_set_copy(host_domain);
3718 context = isl_set_params(context);
3720 for (i = 0; i < gen->n_array; ++i) {
3721 struct cuda_array_info *array = &gen->array[i];
3723 if (array->n_group == 0)
3724 continue;
3726 for (j = 0; j < array->n_index; ++j) {
3727 isl_pw_aff *pwaff;
3729 pwaff = isl_pw_aff_copy(array->bound[j]);
3730 pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3731 array->local_bound[j] = pwaff;
3734 isl_set_free(context);
3737 /* Set gen->tile_len and gen->n_parallel to those of the first statement
3738 * in the statement list u.
3739 * Because of the way the schedule is constructed, the other statements
3740 * in the list, if any, should have the same values for these properties.
3742 static void set_tile_len(struct cuda_gen *gen, struct clast_user_stmt *u)
3744 int nr;
3745 struct cuda_stmt *stmt;
3747 nr = atoi(u->statement->name + 2);
3748 stmt = &gen->stmts[nr];
3750 gen->tile_len = stmt->tile_len;
3751 gen->n_parallel = stmt->n_parallel;
3754 /* Extract a description of the grid, i.e., the possible values
3755 * of the block ids, from gen->tiled_sched.
3756 * The block ids are parameters in gen->tiled_sched.
3757 * We simply need to change them into set dimensions.
3759 static __isl_give isl_set *extract_grid(struct cuda_gen *gen)
3761 int i;
3762 isl_set *grid;
3764 grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
3765 grid = isl_set_from_params(grid);
3766 grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
3767 for (i = 0; i < gen->n_grid; ++i) {
3768 int pos;
3769 char name[20];
3771 snprintf(name, sizeof(name), "b%d", i);
3772 pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
3773 assert(pos >= 0);
3774 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
3775 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
3778 return grid;
3781 /* Print the effective grid size as a list of the sizes in each
3782 * dimension, from innermost to outermost.
3784 * The grid size specified by the user, or set by default
3785 * in read_grid_sizes() and applied in tile_schedule(),
3786 * may be too large for the given code in the sense that
3787 * it may contain blocks that don't need to execute anything.
3788 * We therefore don't print this grid size, but instead the
3789 * smallest grid size that ensures that all blocks that actually
3790 * execute code are included in the grid.
3792 * For each block dimension, we compute the maximal value of the block id
3793 * and add one.
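*
* As a hypothetical example, if the grid extracted above is
*
*	[n] -> { [i] : 0 <= i <= floord(n - 1, 32) }
*
* then the size printed for that dimension is floord(n - 1, 32) + 1,
* which may be smaller than the grid size requested by the user.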
3795 static void print_grid_size(struct cuda_gen *gen, __isl_take isl_set *context)
3797 int i;
3798 isl_printer *prn;
3799 isl_set *grid;
3801 if (gen->n_grid == 0) {
3802 isl_set_free(context);
3803 return;
3806 grid = extract_grid(gen);
3808 prn = isl_printer_to_file(gen->ctx, gen->cuda.host_c);
3809 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
3811 prn = isl_printer_print_str(prn, "(");
3812 for (i = gen->n_grid - 1; i >= 0; --i) {
3813 isl_space *space;
3814 isl_aff *one;
3815 isl_pw_aff *bound = isl_set_dim_max(isl_set_copy(grid), i);
3817 bound = isl_pw_aff_coalesce(bound);
3818 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
3820 space = isl_pw_aff_get_domain_space(bound);
3821 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
3822 one = isl_aff_add_constant_si(one, 1);
3823 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
3824 prn = isl_printer_print_pw_aff(prn, bound);
3825 isl_pw_aff_free(bound);
3827 if (i > 0)
3828 prn = isl_printer_print_str(prn, ", ");
3830 prn = isl_printer_print_str(prn, ")");
3832 isl_printer_free(prn);
3833 isl_set_free(grid);
3834 isl_set_free(context);
3837 /* This function is called for each leaf in the clast of the host code.
3838 * We first specialize the schedule to the site of the leaf, compute
3839 * the size of shared memory and then print the body of host code
3840 * and the associated kernel (through a call to print_kernel_body).
3842 static void print_host_user(struct clast_printer_info *code,
3843 struct clast_user_stmt *u)
3845 struct cuda_gen *gen = code->user;
3846 isl_space *dim;
3847 isl_set *par;
3848 isl_set *host_domain;
3849 isl_union_map *access;
3850 isl_union_map *local_sched;
3851 isl_union_set *arrays;
3853 set_tile_len(gen, u);
3854 read_sizes(gen);
3856 host_domain = extract_entire_host_domain(&u->stmt);
3858 local_sched = isl_union_map_intersect_range(
3859 isl_union_map_copy(gen->sched),
3860 isl_union_set_from_set(extend(isl_set_copy(host_domain),
3861 gen->untiled_len)));
3862 access = isl_union_map_union(isl_union_map_copy(gen->read),
3863 isl_union_map_copy(gen->write));
3864 access = isl_union_map_apply_domain(access,
3865 isl_union_map_copy(local_sched));
3866 arrays = isl_union_map_range(access);
3868 print_indent(code->dst, code->indent);
3869 fprintf(code->dst, "dim3 k%d_dimBlock", gen->kernel_id);
3870 print_reverse_list(code->dst, gen->n_block, gen->block_dim);
3871 fprintf(code->dst, ";\n");
3873 gen->tiled_sched = tile_schedule(gen, local_sched);
3874 gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
3875 gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
3877 print_indent(code->dst, code->indent);
3878 fprintf(code->dst, "dim3 k%d_dimGrid", gen->kernel_id);
3879 print_grid_size(gen, isl_set_params(isl_set_copy(host_domain)));
3880 fprintf(code->dst, ";\n");
3882 gen->local_sched = isl_union_map_copy(gen->tiled_sched);
3884 dim = isl_union_map_get_space(gen->local_sched);
3885 par = parametrization(dim, gen->tiled_len, 0, gen->shared_len, "g");
3886 gen->local_sched = isl_union_map_intersect_range(gen->local_sched,
3887 isl_union_set_from_set(par));
3889 gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
3890 gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
3892 gen->private_access = NULL;
3893 compute_shared_sched(gen);
3894 gen->privatization = compute_privatization(gen);
3895 group_references(gen);
3896 compute_private_size(gen);
3897 check_shared_memory_bound(gen);
3898 localize_bounds(gen, host_domain);
3900 gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
3902 print_kernel_launch(gen, arrays);
3904 fprintf(gen->cuda.kernel_c, "{\n");
3906 print_kernel_body(gen, host_domain, gen->tiled_sched);
3908 fprintf(gen->cuda.kernel_c, "}\n");
3910 free_local_array_info(gen);
3911 isl_map_free(gen->privatization);
3912 isl_union_map_free(gen->private_access);
3913 isl_union_map_free(gen->local_sched);
3914 isl_union_map_free(gen->tiled_sched);
3915 isl_union_map_free(gen->shared_sched);
3916 isl_union_map_free(gen->shared_proj);
3917 isl_union_set_free(arrays);
3918 isl_set_free(host_domain);
3920 free(gen->tile_size);
3921 gen->kernel_id++;
3924 /* Use CLooG to generate code for the outer gen->tile_first loops
3925 * of the global schedule in gen->sched.
3926 * The pretty printing of this code is handled by print_clast,
3927 * which calls print_host_user for each kernel invocation location.
3929 static void print_cloog_host_code(struct cuda_gen *gen)
3931 int i;
3932 isl_set *context;
3933 isl_union_map *sched;
3934 CloogOptions *options;
3935 CloogDomain *cloog_context;
3936 CloogUnionDomain *ud;
3937 CloogInput *input;
3938 struct clast_stmt *stmt;
3939 char name[20];
3941 options = cloog_options_malloc(gen->state);
3942 options->language = CLOOG_LANGUAGE_C;
3943 options->otl = 0;
3944 options->strides = 1;
3945 options->stop = gen->tile_first;
3946 options->f = gen->untiled_len;
3947 options->l = gen->untiled_len;
3948 options->save_domains = 1;
3949 options->noscalars = 1;
3951 sched = isl_union_map_copy(gen->sched);
3952 ud = cloog_union_domain_from_isl_union_map(sched);
3953 for (i = 0; i < options->stop; ++i) {
3954 snprintf(name, sizeof(name), "h%d", i);
3955 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
3957 context = isl_set_copy(gen->context);
3958 cloog_context = cloog_domain_from_isl_set(context);
3959 input = cloog_input_alloc(cloog_context, ud);
3961 stmt = cloog_clast_create_from_input(input, options);
3963 gen->code.indent = 0;
3964 gen->code.dst = gen->cuda.host_c;
3965 gen->code.print_user_stmt = NULL;
3966 gen->code.print_user_stmt_list = &print_host_user;
3967 gen->code.print_for_head = NULL;
3968 gen->code.print_for_foot = NULL;
3969 gen->code.user = gen;
3970 print_clast(&gen->code, stmt);
3972 cloog_clast_free(stmt);
3973 cloog_options_free(options);
3974 fprintf(gen->cuda.host_c, "\n");
3977 void print_cuda_macros(struct cuda_gen *gen)
3979 const char *macros =
3980 "#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
3981 "#define cudaCheckKernel()"
3982 " assert(cudaGetLastError() == cudaSuccess)\n\n";
3983 fputs(macros, gen->cuda.host_c);
3986 void print_host_code(struct cuda_gen *gen)
3988 fprintf(gen->cuda.host_c, "{\n");
3989 print_cloog_macros(gen->cuda.host_c);
3990 print_cloog_macros(gen->cuda.kernel_c);
3992 print_cuda_macros(gen);
3994 declare_device_arrays(gen);
3996 allocate_device_arrays(gen);
3997 copy_arrays_to_device(gen);
3999 gen->kernel_id = 0;
4000 print_cloog_host_code(gen);
4002 copy_arrays_from_device(gen);
4003 free_device_arrays(gen);
4005 fprintf(gen->cuda.host_c, "}\n");
4008 __isl_give isl_set *add_context_from_str(__isl_take isl_set *set,
4009 const char *str)
4011 isl_ctx *ctx;
4012 isl_set *context;
4014 if (!str)
4015 return set;
4017 ctx = isl_set_get_ctx(set);
4018 context = isl_set_read_from_str(ctx, str);
4019 context = isl_set_align_params(context, isl_set_get_space(set));
4020 set = isl_set_intersect(set, context);
4022 return set;
4025 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
4027 if (!str)
4028 return NULL;
4029 return isl_union_map_read_from_str(ctx, str);
4032 /* Return the union of the iteration domains of all gen->stmts[i].
4034 static __isl_give isl_union_set *extract_domain(struct cuda_gen *gen)
4036 int i;
4037 isl_union_set *domain;
4039 domain = isl_union_set_empty(isl_set_get_space(gen->context));
4040 for (i = 0; i < gen->n_stmts; ++i) {
4041 isl_set *domain_i;
4043 domain_i = isl_set_copy(gen->stmts[i].domain);
4044 domain = isl_union_set_union(domain,
4045 isl_union_set_from_set(domain_i));
4048 return domain;
4051 /* Information about the outermost tilable bands in the forest of bands.
4053 * tile_len and n_parallel are only set on band_info structures
4054 * that correspond to outermost bands. For other bands (in particular,
4055 * ancestors of the outermost bands), n_parallel is set to 0.
4057 * prefix is the (padded) schedule leading up to the outermost tilable bands.
4059 * tile_first is the number of schedule dimensions in prefix.
4061 * suffix is the schedule of the outermost tilable bands and their descendants.
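*
* As a hypothetical illustration, for a schedule with one outer sequential
* loop followed by a two-dimensional parallel band, the corresponding
* band_info would have tile_first = 1, tile_len = 2, n_parallel the number
* of parallel members of that band, prefix the one-dimensional outer
* schedule and suffix the schedule of the band itself and its descendants.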
4063 struct band_info {
4064 struct cuda_gen *gen;
4065 int tile_first;
4066 int tile_len;
4067 int n_parallel;
4068 isl_union_map *prefix;
4069 isl_union_map *suffix;
4072 /* Set tile_len and n_parallel of each statement to those of
4073 * its outermost band, recorded in the band_info.
4075 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
4077 struct band_info *info = user;
4078 int nr;
4079 struct cuda_stmt *stmt;
4081 nr = atoi(isl_map_get_tuple_name(map, isl_dim_in) + 2);
4082 stmt = &info->gen->stmts[nr];
4084 stmt->tile_len = info->tile_len;
4085 stmt->n_parallel = info->n_parallel;
4087 isl_map_free(map);
4089 return 0;
4092 static void list_select_outer_band(struct cuda_gen *gen,
4093 __isl_take isl_band_list *list, int pos, struct band_info *list_info);
4095 /* Check if this band has any parallel loops. If so, take it as
4096 * the outermost tilable band. If not, continue looking for the
4097 * outermost tilable band in the children of the current band.
4099 static void band_select_outer_band(struct cuda_gen *gen,
4100 __isl_take isl_band *band, int pos, struct band_info *info)
4102 int n = isl_band_n_member(band);
4103 int n_parallel;
4105 for (n_parallel = 0; n_parallel < n; ++n_parallel)
4106 if (!isl_band_member_is_zero_distance(band, n_parallel))
4107 break;
4109 info->n_parallel = n_parallel;
4110 if (n_parallel) {
4111 info->gen = gen;
4112 info->tile_first = pos;
4113 info->tile_len = n;
4114 info->prefix = isl_band_get_prefix_schedule(band);
4115 info->suffix = isl_union_map_flat_range_product(
4116 isl_band_get_partial_schedule(band),
4117 isl_band_get_suffix_schedule(band));
4118 isl_union_map_foreach_map(info->prefix,
4119 &set_stmt_tile_len, info);
4120 } else if (isl_band_has_children(band)) {
4121 isl_band_list *children;
4122 children = isl_band_get_children(band);
4123 list_select_outer_band(gen, children, pos + n, info);
4124 } else {
4125 info->gen = gen;
4126 info->tile_first = pos + n;
4127 info->tile_len = 0;
4128 info->prefix = isl_union_map_flat_range_product(
4129 isl_band_get_prefix_schedule(band),
4130 isl_band_get_partial_schedule(band));
4131 info->suffix = isl_band_get_suffix_schedule(band);
4132 isl_union_map_foreach_map(info->prefix,
4133 &set_stmt_tile_len, info);
4136 isl_band_free(band);
4139 /* Comparison function that returns a non-zero value for band_infos
4140 * with different tile_len fields or different n_parallel fields.
4142 static int cmp_band(const void *p1, const void *p2)
4144 const struct band_info *info1 = p1;
4145 const struct band_info *info2 = p2;
4147 if (info1->tile_len != info2->tile_len)
4148 return info1->tile_len - info2->tile_len;
4150 return info1->n_parallel - info2->n_parallel;
4153 /* Extend the range of "umap" with coordinates of fixed value "val"
4154 * to a total length of "dst_len", assuming the original range has dimension "src_len".
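*
* For example (purely illustrative), with src_len = 2, dst_len = 4 and
* val = 0, a map
*
*	{ S[i, j] -> [i, j] }
*
* in "umap" would be extended to
*
*	{ S[i, j] -> [i, j, 0, 0] }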
4156 static __isl_give isl_union_map *extend_range(__isl_take isl_union_map *umap,
4157 int src_len, int dst_len, int val)
4159 isl_space *dim;
4160 isl_map *map;
4161 int i;
4163 dim = isl_union_map_get_space(umap);
4164 map = isl_map_reverse(projection(dim, dst_len, src_len));
4165 for (i = src_len; i < dst_len; ++i)
4166 map = isl_map_fix_si(map, isl_dim_out, i, val);
4168 umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
4170 return umap;
4173 /* Group bands with the same values for tile_len and n_parallel.
4174 * The prefix schedule is then extended with a fixed coordinate that
4175 * is different for each such group.
4176 * Note that the actual values for this coordinate are not important.
4177 * The bands have already been effectively separated at a higher level
4178 * or they are independent and may be executed in parallel.
4179 * The list of band_info has been sorted before this function is called.
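*
* As a hypothetical illustration, given two sorted groups of bands with
* different (tile_len, n_parallel) values and aligned prefix schedules
*
*	{ S1[i] -> [i] }	and	{ S2[i] -> [i] }
*
* the prefixes would be extended to
*
*	{ S1[i] -> [i, 0] }	and	{ S2[i] -> [i, 1] }
*
* so that code for the two groups is generated separately.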
4181 static void separate_bands(struct band_info *info, int n)
4183 int i;
4184 int j = 0;
4186 for (i = 0; i < n; ++i) {
4187 int l = info[i].tile_first;
4189 if (i &&
4190 (info[i].tile_len != info[i - 1].tile_len ||
4191 info[i].n_parallel != info[i - 1].n_parallel))
4192 j++;
4194 info[i].prefix = extend_range(info[i].prefix,
4195 l, l + 1, j);
4196 info[i].tile_first = l + 1;
4200 /* Select the outermost bands in the elements of the list, align
4201 * their prefix schedules, separate bands with different values
4202 * for tile_len and/or n_parallel and then combine the resulting
4203 * prefix and suffix schedules into a single pair of prefix and
4204 * suffix schedules for the entire list.
4206 static void list_select_outer_band(struct cuda_gen *gen,
4207 __isl_take isl_band_list *list, int pos, struct band_info *list_info)
4209 isl_band *band;
4210 int i;
4211 int n = isl_band_list_n_band(list);
4212 isl_ctx *ctx = isl_band_list_get_ctx(list);
4213 struct band_info *info;
4214 int max_tile_first;
4215 isl_union_map *prefix;
4216 isl_union_map *suffix;
4218 assert(n >= 1);
4219 info = isl_calloc_array(ctx, struct band_info, n);
4220 assert(info);
4222 max_tile_first = 0;
4223 for (i = 0; i < n; ++i) {
4224 band = isl_band_list_get_band(list, i);
4225 band_select_outer_band(gen, band, pos, &info[i]);
4226 if (info[i].tile_first > max_tile_first)
4227 max_tile_first = info[i].tile_first;
4230 for (i = 0; i < n; ++i) {
4231 if (info[i].tile_first == max_tile_first)
4232 continue;
4233 info[i].prefix = extend_range(info[i].prefix,
4234 info[i].tile_first, max_tile_first, 0);
4235 info[i].tile_first = max_tile_first;
4238 qsort(info, n, sizeof(struct band_info), &cmp_band);
4240 for (i = 0; i < n - 1; ++i)
4241 if (info[i].tile_len != info[i + 1].tile_len ||
4242 info[i].n_parallel != info[i + 1].n_parallel)
4243 break;
4245 if (i < n - 1)
4246 separate_bands(info, n);
4248 prefix = info[0].prefix;
4249 suffix = info[0].suffix;
4251 for (i = 1; i < n; ++i) {
4252 prefix = isl_union_map_union(prefix, info[i].prefix);
4253 suffix = isl_union_map_union(suffix, info[i].suffix);
4256 list_info->tile_first = info[0].tile_first;
4257 list_info->tile_len = -1;
4258 list_info->prefix = prefix;
4259 list_info->suffix = suffix;
4261 isl_band_list_free(list);
4262 free(info);
4265 /* Set max_out to the maximal number of output dimensions over
4266 * all maps.
4268 static int update_max_out(__isl_take isl_map *map, void *user)
4270 int *max_out = user;
4271 int n_out = isl_map_dim(map, isl_dim_out);
4273 if (n_out > *max_out)
4274 *max_out = n_out;
4276 isl_map_free(map);
4277 return 0;
4280 struct align_range_data {
4281 int max_out;
4282 isl_union_map *res;
4285 /* Extend the dimension of the range of the given map to data->max_out and
4286 * then add the result to data->res.
4288 static int map_align_range(__isl_take isl_map *map, void *user)
4290 struct align_range_data *data = user;
4291 int i;
4292 isl_space *dim;
4293 isl_map *proj;
4294 int n_out = isl_map_dim(map, isl_dim_out);
4296 dim = isl_union_map_get_space(data->res);
4297 proj = isl_map_reverse(projection(dim, data->max_out, n_out));
4298 for (i = n_out; i < data->max_out; ++i)
4299 proj = isl_map_fix_si(proj, isl_dim_out, i, 0);
4301 map = isl_map_apply_range(map, proj);
4303 data->res = isl_union_map_add_map(data->res, map);
4305 return 0;
4308 /* Extend the ranges of the maps in the union map such that they all have
4309 * the same dimension.
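*
* E.g. (an illustrative example), the union map
*
*	{ S1[i] -> [i, 0]; S2[i, j] -> [i, j, 1] }
*
* would become
*
*	{ S1[i] -> [i, 0, 0]; S2[i, j] -> [i, j, 1] }
*
* with the shorter range padded with zeros up to the maximal dimension.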
4311 static __isl_give isl_union_map *align_range(__isl_take isl_union_map *umap)
4313 struct align_range_data data;
4315 data.max_out = 0;
4316 isl_union_map_foreach_map(umap, &update_max_out, &data.max_out);
4318 data.res = isl_union_map_empty(isl_union_map_get_space(umap));
4319 isl_union_map_foreach_map(umap, &map_align_range, &data);
4321 isl_union_map_free(umap);
4322 return data.res;
4325 /* Select the outermost tilable band that (by construction)
4326 * has at least one parallel loop.
4327 * The starting position of the aligned band is stored in
4328 * gen->tile_first.
4329 * The sizes and number of parallel loops may be different in different
4330 * parts of the band forest and are therefore stored in the cuda_stmts.
4332 * Return the complete schedule, with the tilable bands aligned
4333 * at gen->tile_first and padded with zero, if needed.
4335 static __isl_give isl_union_map *select_outer_tilable_band(struct cuda_gen *gen,
4336 __isl_keep isl_schedule *schedule)
4338 isl_band_list *list;
4339 struct band_info info;
4341 gen->n_parallel = 0;
4342 gen->tile_len = -1;
4344 list = isl_schedule_get_band_forest(schedule);
4346 list_select_outer_band(gen, list, 0, &info);
4348 gen->tile_first = info.tile_first;
4349 info.suffix = align_range(info.suffix);
4351 return isl_union_map_flat_range_product(info.prefix, info.suffix);
4354 /* Set gen->untiled_len to the number of scheduling dimensions
4355 * for the schedule of the first domain.
4356 * We assume here that this number is the same for all domains.
4358 static int set_untiled_len(__isl_take isl_map *map, void *user)
4360 unsigned *untiled_len = user;
4362 *untiled_len = isl_map_dim(map, isl_dim_out);
4364 isl_map_free(map);
4365 return -1;
4368 /* Compute an appropriate schedule based on the accesses in
4369 * gen->read and gen->write.
4371 * We first compute dependences and then use those to compute
4372 * a schedule that has a parallel loop in each tilable band.
4373 * Finally, we select the outermost tilable band.
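*
* As a purely illustrative example, for a pair of statements
*
*	S1:	a[i] = f(i);
*	S2:	b[i] = g(a[i - 1]);
*
* the first call to isl_union_map_compute_flow below yields flow
* dependences that include { S1[i] -> S2[i + 1] }, and the computed
* schedule is required to respect this ordering.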
4375 static void compute_schedule(struct cuda_gen *gen,
4376 __isl_take isl_union_map *sched)
4378 isl_ctx *ctx = isl_union_map_get_ctx(sched);
4379 isl_union_set *domain;
4380 isl_union_map *empty;
4381 isl_union_map *dep_raw, *dep2, *dep3, *dep;
4382 isl_union_map *uninitialized;
4383 isl_schedule *schedule;
4385 empty = isl_union_map_empty(isl_union_map_get_space(sched));
4387 isl_union_map_compute_flow(isl_union_map_copy(gen->read),
4388 isl_union_map_copy(gen->write), empty,
4389 isl_union_map_copy(sched),
4390 &dep_raw, NULL, &uninitialized, NULL);
4391 isl_union_map_compute_flow(isl_union_map_copy(gen->write),
4392 isl_union_map_copy(gen->write),
4393 isl_union_map_copy(gen->read),
4394 isl_union_map_copy(sched),
4395 &dep2, &dep3, NULL, NULL);
4396 isl_union_map_free(sched);
4398 gen->copy_in = isl_union_map_range(uninitialized);
4400 dep = isl_union_map_union(dep2, dep3);
4401 dep = isl_union_map_union(dep, dep_raw);
4402 dep = isl_union_map_coalesce(dep);
4404 domain = extract_domain(gen);
4405 schedule = isl_union_set_compute_schedule(isl_union_set_copy(domain),
4406 isl_union_map_copy(dep), dep);
4408 sched = select_outer_tilable_band(gen, schedule);
4410 isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
4411 sched = isl_union_map_intersect_domain(sched, domain);
4412 gen->sched = sched;
4414 isl_schedule_free(schedule);
4417 static struct cuda_stmt_access **expr_extract_access(struct pet_expr *expr,
4418 struct cuda_stmt_access **next_access)
4420 struct cuda_stmt_access *access;
4421 isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
4423 access = isl_alloc_type(ctx, struct cuda_stmt_access);
4424 assert(access);
4425 access->next = NULL;
4426 access->read = expr->acc.read;
4427 access->write = expr->acc.write;
4428 access->access = isl_map_copy(expr->acc.access);
4430 *next_access = access;
4431 next_access = &(*next_access)->next;
4432 return next_access;
4435 static struct cuda_stmt_access **expr_extract_accesses(struct pet_expr *expr,
4436 struct cuda_stmt_access **next_access)
4438 int i;
4440 for (i = 0; i < expr->n_arg; ++i)
4441 next_access = expr_extract_accesses(expr->args[i],
4442 next_access);
4444 if (expr->type == pet_expr_access)
4445 next_access = expr_extract_access(expr, next_access);
4447 return next_access;
4450 static void pet_stmt_extract_accesses(struct cuda_stmt *stmt)
4452 struct cuda_stmt_access **next_access = &stmt->accesses;
4454 stmt->accesses = NULL;
4455 expr_extract_accesses(stmt->body, next_access);
4458 /* Return an array of cuda_stmt representing the statements in "scop".
4460 static struct cuda_stmt *extract_stmts(isl_ctx *ctx, struct pet_scop *scop,
4461 __isl_keep isl_set *context)
4463 int i;
4464 struct cuda_stmt *stmts;
4466 stmts = isl_calloc_array(ctx, struct cuda_stmt, scop->n_stmt);
4467 assert(stmts);
4469 for (i = 0; i < scop->n_stmt; ++i) {
4470 struct cuda_stmt *s = &stmts[i];
4472 s->domain = isl_set_copy(scop->stmts[i]->domain);
4473 s->domain = isl_set_intersect_params(s->domain,
4474 isl_set_copy(context));
4475 s->body = scop->stmts[i]->body;
4476 pet_stmt_extract_accesses(s);
4479 return stmts;
4482 /* Replace the scop in the "input" file by equivalent code
4483 * that uses the GPU. "scop" is assumed to correspond to this scop.
4485 * We first compute a schedule that respects the dependences
4486 * of the original program and select the outermost band
4487 * of tilable dimensions that has at least one parallel loop.
4488 * We then have three blocks of dimensions
4490 * H B G
4492 * The tilable band "B" is first tiled according to "tile" sizes, resulting
4493 * in
4495 * H T P G
4497 * For each iteration of the T loop and for each array, we compute
4498 * the array elements accessed by that iteration, construct a rectangular
4499 * box around it and shift it to the origin. The result is used
4500 * as shared memory for the array.
4502 * We then split off at most 2 parallel loops from the T loops and
4503 * at most 3 parallel loops from the P loops
4505 * H T1 T2 P1 P2 G
4507 * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
4508 * according to "grid"/"block" sizes.
4510 * H T1T T1P T2 P1T P1P P2 G
4512 * Finally, the T1P and P1P iterators are equated to the block and
4513 * thread dimensions respectively and so are effectively removed.
4514 * The H loops are run on the host. The T1T, T2, P1T, P2 and G loops
4515 * are run on the GPU.
4517 * Code is generated in three stages. We first generate code for the
4518 * host (the H loops), with iterators h%d. Then, for each leaf node
4519 * of the resulting AST, we generate code for the shared loops (up to
4520 * and including T2), with iterators g%d and after equating the H loops
4521 * to h%d parameters and the T1P loops to the block dimensions.
4522 * Finally, we generate code for the remaining loops in a similar fashion.
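*
* As a purely illustrative sketch (not actual generated output), for a
* single parallel loop over i with tile size 32, a one-dimensional grid
* and a one-dimensional block, B consists of the i loop, T enumerates
* tiles of 32 iterations and P the 32 iterations within a tile.
* After wrapping, the tile number is distributed over the blocks
* (T1P is equated to b0) and the intra-tile iteration over the threads
* (P1P is equated to t0), so that the generated kernel only contains
* loops for the remaining T1T and P1T iterations, if any.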
4524 int cuda_pet(isl_ctx *ctx, struct pet_scop *scop, struct ppcg_options *options,
4525 const char *input)
4527 isl_union_map *sched;
4528 struct cuda_gen gen;
4530 if (!scop)
4531 return -1;
4533 scop = pet_scop_align_params(scop);
4535 gen.ctx = ctx;
4536 gen.context = isl_set_copy(scop->context);
4537 gen.context = add_context_from_str(gen.context, options->ctx);
4538 gen.sizes = extract_sizes_from_str(ctx, options->sizes);
4539 gen.n_stmts = scop->n_stmt;
4540 gen.stmts = extract_stmts(ctx, scop, gen.context);
4541 gen.read = pet_scop_collect_reads(scop);
4542 gen.write = pet_scop_collect_writes(scop);
4543 gen.options = options;
4544 gen.state = cloog_isl_state_malloc(gen.ctx);
4545 gen.scop = scop;
4547 cuda_open_files(&gen.cuda, input);
4549 collect_array_info(&gen);
4551 sched = pet_scop_collect_schedule(scop);
4553 compute_schedule(&gen, sched);
4555 print_host_code(&gen);
4557 cloog_state_free(gen.state);
4558 clear_cuda_gen(&gen);
4560 cuda_close_files(&gen.cuda);
4562 return 0;