update isl to version 0.10
[ppcg.git] / cuda.c
blob dd0e6b4fc97091d9e9efba634103a68beb661f59
1 /*
2 * Copyright 2010-2011 INRIA Saclay
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
7 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
8 * 91893 Orsay, France
9 */
11 #include <assert.h>
12 #include <stdlib.h>
14 #include <isl/polynomial.h>
15 #include <isl/union_set.h>
16 #include <isl/aff.h>
17 #include <isl/ilp.h>
18 #include <isl/flow.h>
19 #include <isl/band.h>
20 #include <isl/schedule.h>
21 #include <isl/options.h>
22 #include <cloog/isl/cloog.h>
24 #include "cuda.h"
25 #include "cuda_common.h"
26 #include "clast_printer.h"
27 #include "schedule.h"
28 #include "pet_printer.h"
29 #include "ppcg_options.h"
31 /* The fields stride, shift and shift_map only contain valid information
32 * if shift != NULL.
33 * If so, they express that the current index is such that if you add shift,
34 * then the result is always a multiple of stride.
35 * shift_map contains the mapping
37 * i -> (i + shift)/stride
39 struct cuda_array_bound {
40 isl_int size;
41 isl_aff *lb;
43 isl_int stride;
44 isl_aff *shift;
45 isl_basic_map *shift_map;
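/* A worked example of the fields above (values are hypothetical): with
 * shift = 1 and stride = 4, the accessed indices are of the form
 * 4 * alpha - 1 (3, 7, 11, ...), and shift_map sends each such index i
 * to (i + 1)/4.
 */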
48 struct cuda_array_info;
50 /* A group of array references in a kernel that should be handled together.
51 * If private_bound is not NULL, then it is mapped to registers.
52 * Otherwise, if shared_bound is not NULL, it is mapped to shared memory.
53 * Otherwise, it is accessed from global memory.
55 struct cuda_array_ref_group {
56 /* The references in this group access this array. */
57 struct cuda_array_info *array;
58 /* Position of this group in the list of reference groups of array. */
59 int nr;
61 /* The following fields are used during the construction of the groups.
62 * access is the combined access relation relative to the shared
63 * memory tiling.
64 * write is set if any access in the group is a write.
66 isl_map *access;
67 int write;
69 /* For each index, size and offset of piece in shared memory. */
70 struct cuda_array_bound *shared_bound;
72 /* For each index, size and offset of piece in private memory. */
73 struct cuda_array_bound *private_bound;
75 /* References in this group; point to elements of a linked list. */
76 int n_ref;
77 struct cuda_stmt_access **refs;
79 /* Last shared memory tile dimension that affects the tile of this group. */
80 int last_shared;
81 /* Dimension at which copying to/from shared memory is printed.
82 * if >= 0, then the value is >= last_shared
83 * if -1, then the copying is done at the leaf level.
85 int print_shared_level;
88 struct cuda_array_info {
89 isl_space *dim;
90 /* Element type. */
91 char *type;
92 /* Element size. */
93 int size;
94 /* Name of the array. */
95 char *name;
96 /* Number of indices. */
97 unsigned n_index;
98 /* For each index, a bound on the array in that direction. */
99 isl_pw_aff **bound;
100 /* For each index, bound[i] specialized to the current kernel. */
101 isl_pw_aff **local_bound;
103 /* All references to this array; point to elements of a linked list. */
104 int n_ref;
105 struct cuda_stmt_access **refs;
107 /* The reference groups associated to this array. */
108 int n_group;
109 struct cuda_array_ref_group **groups;
111 /* For scalars, is this scalar read-only within the entire program? */
112 int read_only;
115 /* Print the name of the local copy of a given group of array references.
117 static void print_array_name(FILE *out, struct cuda_array_ref_group *group)
119 int global = 0;
121 if (group->private_bound)
122 fprintf(out, "private_");
123 else if (group->shared_bound)
124 fprintf(out, "shared_");
125 else
126 global = 1;
127 fprintf(out, "%s", group->array->name);
128 if (!global && group->array->n_group > 1)
129 fprintf(out, "_%d", group->nr);
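/* For instance (hypothetical array name): a group with nr == 1 of an array
 * "A" that has more than one reference group prints as "shared_A_1" when
 * mapped to shared memory, as "private_A_1" when mapped to registers, and
 * simply as "A" when it is accessed from global memory.
 */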
132 /* Collect all references to the given array and store pointers to them
133 * in array->refs.
135 static void collect_references(struct cuda_gen *gen,
136 struct cuda_array_info *array)
138 int i;
139 int n;
141 n = 0;
142 for (i = 0; i < gen->n_stmts; ++i) {
143 struct cuda_stmt *stmt = &gen->stmts[i];
144 struct cuda_stmt_access *access;
146 for (access = stmt->accesses; access; access = access->next) {
147 const char *name;
148 name = isl_map_get_tuple_name(access->access,
149 isl_dim_out);
150 if (name && !strcmp(array->name, name))
151 n++;
155 array->n_ref = n;
156 array->refs = isl_alloc_array(gen->ctx, struct cuda_stmt_access *, n);
157 assert(array->refs);
159 n = 0;
160 for (i = 0; i < gen->n_stmts; ++i) {
161 struct cuda_stmt *stmt = &gen->stmts[i];
162 struct cuda_stmt_access *access;
164 for (access = stmt->accesses; access; access = access->next) {
165 const char *name;
166 name = isl_map_get_tuple_name(access->access,
167 isl_dim_out);
168 if (!name || strcmp(array->name, name))
169 continue;
171 array->refs[n++] = access;
176 static struct cuda_array_bound *create_bound_list(isl_ctx *ctx, int n_index)
178 int i;
179 struct cuda_array_bound *bound;
181 bound = isl_alloc_array(ctx, struct cuda_array_bound, n_index);
182 assert(bound);
184 for (i = 0; i < n_index; ++i) {
185 isl_int_init(bound[i].size);
186 bound[i].lb = NULL;
187 isl_int_init(bound[i].stride);
188 bound[i].shift = NULL;
189 bound[i].shift_map = NULL;
192 return bound;
195 static void free_bound_list(struct cuda_array_bound *bound, int n_index)
197 int j;
199 if (!bound)
200 return;
202 for (j = 0; j < n_index; ++j) {
203 isl_int_clear(bound[j].size);
204 isl_int_clear(bound[j].stride);
205 isl_aff_free(bound[j].lb);
206 isl_aff_free(bound[j].shift);
207 isl_basic_map_free(bound[j].shift_map);
209 free(bound);
212 static struct pet_array *find_array(struct pet_scop *scop,
213 __isl_keep isl_set *accessed)
215 int i;
216 isl_id *id;
218 id = isl_set_get_tuple_id(accessed);
220 for (i = 0; i < scop->n_array; ++i) {
221 isl_id *id_i;
223 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
224 isl_id_free(id_i);
225 if (id == id_i)
226 break;
228 isl_id_free(id);
230 return i < scop->n_array ? scop->arrays[i] : NULL;
233 /* Compute bounds on the host arrays based on the accessed elements
234 * and collect all references to the array.
236 * If the array is zero-dimensional, i.e., a scalar, we check
237 * whether it is read-only.
239 static int extract_array_info(__isl_take isl_set *array, void *user)
241 int i;
242 struct cuda_gen *gen = (struct cuda_gen *)user;
243 const char *name;
244 int n_index;
245 isl_pw_aff **bounds;
246 isl_pw_aff **local_bounds;
247 struct pet_array *pa;
249 n_index = isl_set_dim(array, isl_dim_set);
250 name = isl_set_get_tuple_name(array);
251 bounds = isl_alloc_array(isl_set_get_ctx(array),
252 isl_pw_aff *, n_index);
253 assert(bounds);
254 local_bounds = isl_calloc_array(isl_set_get_ctx(array),
255 isl_pw_aff *, n_index);
256 assert(local_bounds);
257 gen->array[gen->n_array].dim = isl_set_get_space(array);
258 gen->array[gen->n_array].name = strdup(name);
259 gen->array[gen->n_array].n_index = n_index;
260 gen->array[gen->n_array].bound = bounds;
261 gen->array[gen->n_array].local_bound = local_bounds;
263 pa = find_array(gen->scop, array);
264 assert(pa);
266 gen->array[gen->n_array].type = strdup(pa->element_type);
267 gen->array[gen->n_array].size = pa->element_size;
269 if (n_index == 0) {
270 isl_set *space;
271 isl_union_map *write;
272 int empty;
274 write = isl_union_map_copy(gen->write);
275 space = isl_set_universe(isl_set_get_space(array));
276 write = isl_union_map_intersect_range(write,
277 isl_union_set_from_set(space));
278 empty = isl_union_map_is_empty(write);
279 isl_union_map_free(write);
281 gen->array[gen->n_array].read_only = empty;
284 for (i = 0; i < n_index; ++i) {
285 isl_set *dom;
286 isl_local_space *ls;
287 isl_aff *one;
288 isl_pw_aff *bound;
289 isl_set *size = i == 0 ? array : pa->extent;
291 bound = isl_set_dim_max(isl_set_copy(size), i);
292 assert(bound);
293 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
294 ls = isl_local_space_from_space(isl_set_get_space(dom));
295 one = isl_aff_zero_on_domain(ls);
296 one = isl_aff_add_constant_si(one, 1);
297 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
298 bound = isl_pw_aff_gist(bound, isl_set_copy(gen->context));
300 bounds[i] = bound;
303 collect_references(gen, &gen->array[gen->n_array]);
305 gen->n_array++;
307 isl_set_free(array);
308 return 0;
311 void collect_array_info(struct cuda_gen *gen)
313 isl_union_set *arrays;
315 arrays = isl_union_map_range(isl_union_map_copy(gen->read));
316 arrays = isl_union_set_union(arrays,
317 isl_union_map_range(isl_union_map_copy(gen->write)));
318 arrays = isl_union_set_coalesce(arrays);
320 gen->n_array = isl_union_set_n_set(arrays);
321 gen->array = isl_alloc_array(gen->ctx,
322 struct cuda_array_info, gen->n_array);
323 assert(gen->array);
324 gen->n_array = 0;
325 isl_union_set_foreach_set(arrays, &extract_array_info, gen);
326 isl_union_set_free(arrays);
329 static void free_array_info(struct cuda_gen *gen)
331 int i, j;
333 for (i = 0; i < gen->n_array; ++i) {
334 int n_index = gen->array[i].n_index;
335 free(gen->array[i].type);
336 free(gen->array[i].name);
337 for (j = 0; j < n_index; ++j) {
338 isl_pw_aff_free(gen->array[i].bound[j]);
339 isl_pw_aff_free(gen->array[i].local_bound[j]);
341 isl_space_free(gen->array[i].dim);
342 free(gen->array[i].bound);
343 free(gen->array[i].local_bound);
344 free(gen->array[i].refs);
346 free(gen->array);
349 /* Check if a cuda array is a scalar. A scalar is a value that is not stored
350 * as an array or through a pointer reference, but as a single data element. At
351 * the moment, scalars are represented as zero dimensional arrays.
353 static int cuda_array_is_scalar(struct cuda_array_info *array)
355 return (array->n_index == 0);
358 /* Is "array" a read-only scalar?
360 static int cuda_array_is_read_only_scalar(struct cuda_array_info *array)
362 return cuda_array_is_scalar(array) && array->read_only;
365 static void declare_device_arrays(struct cuda_gen *gen)
367 int i;
369 for (i = 0; i < gen->n_array; ++i) {
370 if (cuda_array_is_read_only_scalar(&gen->array[i]))
371 continue;
372 fprintf(gen->cuda.host_c, "%s *dev_%s;\n",
373 gen->array[i].type, gen->array[i].name);
375 fprintf(gen->cuda.host_c, "\n");
378 static void print_array_size(struct cuda_gen *gen, FILE *out,
379 struct cuda_array_info *array)
381 int i;
382 isl_printer *prn;
384 prn = isl_printer_to_file(gen->ctx, out);
385 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
386 for (i = 0; i < array->n_index; ++i) {
387 prn = isl_printer_print_str(prn, "(");
388 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
389 prn = isl_printer_print_str(prn, ") * ");
391 prn = isl_printer_print_str(prn, "sizeof(");
392 prn = isl_printer_print_str(prn, array->type);
393 prn = isl_printer_print_str(prn, ")");
394 isl_printer_free(prn);
397 static void allocate_device_arrays(struct cuda_gen *gen)
399 int i;
401 for (i = 0; i < gen->n_array; ++i) {
402 if (cuda_array_is_read_only_scalar(&gen->array[i]))
403 continue;
404 fprintf(gen->cuda.host_c,
405 "cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
406 gen->array[i].name);
407 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
408 fprintf(gen->cuda.host_c, "));\n");
410 fprintf(gen->cuda.host_c, "\n");
413 static void free_device_arrays(struct cuda_gen *gen)
415 int i;
417 for (i = 0; i < gen->n_array; ++i) {
418 if (cuda_array_is_read_only_scalar(&gen->array[i]))
419 continue;
420 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaFree(dev_%s));\n",
421 gen->array[i].name);
425 static void copy_arrays_to_device(struct cuda_gen *gen)
427 int i;
429 for (i = 0; i < gen->n_array; ++i) {
430 isl_space *dim;
431 isl_set *read_i;
432 int empty;
434 if (cuda_array_is_read_only_scalar(&gen->array[i]))
435 continue;
437 dim = isl_space_copy(gen->array[i].dim);
438 read_i = isl_union_set_extract_set(gen->copy_in, dim);
439 empty = isl_set_fast_is_empty(read_i);
440 isl_set_free(read_i);
441 if (empty)
442 continue;
444 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(dev_%s,",
445 gen->array[i].name);
447 if (cuda_array_is_scalar(&(gen->array[i])))
448 fprintf(gen->cuda.host_c, " &%s, ",
449 gen->array[i].name);
450 else
451 fprintf(gen->cuda.host_c, " %s, ", gen->array[i].name);
453 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
454 fprintf(gen->cuda.host_c, ", cudaMemcpyHostToDevice));\n");
456 fprintf(gen->cuda.host_c, "\n");
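/* As an illustration (array name, bound and element type are hypothetical),
 * for a one-dimensional float array A with bound N this emits
 *
 *     cudaCheckReturn(cudaMemcpy(dev_A, A, (N) * sizeof(float), cudaMemcpyHostToDevice));
 *
 * into the host code.
 */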
459 static void copy_arrays_from_device(struct cuda_gen *gen)
461 int i;
462 isl_union_set *write;
463 write = isl_union_map_range(isl_union_map_copy(gen->write));
465 for (i = 0; i < gen->n_array; ++i) {
466 isl_space *dim;
467 isl_set *write_i;
468 int empty;
470 dim = isl_space_copy(gen->array[i].dim);
471 write_i = isl_union_set_extract_set(write, dim);
472 empty = isl_set_fast_is_empty(write_i);
473 isl_set_free(write_i);
474 if (empty)
475 continue;
477 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(");
478 if (cuda_array_is_scalar(&gen->array[i]))
479 fprintf(gen->cuda.host_c, "&%s, ", gen->array[i].name);
480 else
481 fprintf(gen->cuda.host_c, "%s, ", gen->array[i].name);
482 fprintf(gen->cuda.host_c, "dev_%s, ", gen->array[i].name);
483 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
484 fprintf(gen->cuda.host_c, ", cudaMemcpyDeviceToHost));\n");
487 isl_union_set_free(write);
488 fprintf(gen->cuda.host_c, "\n");
491 /* Internal data structure for extract_size_of_type.
492 * "type" specifies the name of the space that we want to extract.
493 * "res" is used to store the subset of that space.
495 struct ppcg_extract_size_data {
496 const char *type;
497 isl_set *res;
500 /* This function is called for each set in a union_set.
501 * If the name of the set matches data->type, we store the
502 * set in data->res.
504 static int extract_size_of_type(__isl_take isl_set *size, void *user)
506 struct ppcg_extract_size_data *data = user;
507 const char *name;
509 name = isl_set_get_tuple_name(size);
510 if (name && !strcmp(name, data->type)) {
511 data->res = size;
512 return -1;
515 isl_set_free(size);
516 return 0;
519 /* Given a union map { kernel[i] -> *[...] },
520 * return the range in the space called "type" for the kernel with
521 * sequence number "id".
523 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
524 const char *type, int id)
526 isl_space *space;
527 isl_set *dom;
528 isl_union_set *local_sizes;
529 struct ppcg_extract_size_data data = { type, NULL };
531 if (!sizes)
532 return NULL;
534 space = isl_union_map_get_space(sizes);
535 space = isl_space_set_from_params(space);
536 space = isl_space_add_dims(space, isl_dim_set, 1);
537 space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
538 dom = isl_set_universe(space);
539 dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
541 local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
542 isl_union_map_copy(sizes));
543 isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
544 isl_union_set_free(local_sizes);
545 return data.res;
548 /* Given a singleton set, extract the first (at most *len) elements
549 * of the single integer tuple into *sizes and update *len if needed.
551 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
553 int i;
554 int dim;
555 isl_int v;
557 if (!set)
558 return;
560 dim = isl_set_dim(set, isl_dim_set);
561 if (dim < *len)
562 *len = dim;
564 isl_int_init(v);
566 for (i = 0; i < *len; ++i) {
567 int ok;
569 ok = isl_set_plain_is_fixed(set, isl_dim_set, i, &v);
570 assert(ok);
572 sizes[i] = isl_int_get_si(v);
575 isl_int_clear(v);
577 isl_set_free(set);
580 /* Extract user specified "tile" sizes from the "sizes" command line option,
581 * defaulting to option->tile_size in each dimension.
583 static void read_tile_sizes(struct cuda_gen *gen)
585 int n;
586 isl_set *size;
588 gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
589 assert(gen->tile_size);
590 for (n = 0; n < gen->tile_len; ++n)
591 gen->tile_size[n] = gen->options->tile_size;
593 size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
594 read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
596 if (gen->n_parallel > gen->tile_len)
597 gen->n_parallel = gen->tile_len;
600 /* Extract user specified "block" sizes from the "sizes" command line option,
601 * after filling in some potentially useful defaults.
603 static void read_block_sizes(struct cuda_gen *gen)
605 int n;
606 isl_set *size;
608 n = gen->n_parallel;
609 gen->n_block = (n <= 3) ? n : 3;
610 switch (gen->n_block) {
611 case 1:
612 gen->block_dim[0] = 512;
613 break;
614 case 2:
615 gen->block_dim[0] = 32;
616 gen->block_dim[1] = 16;
617 break;
618 default:
619 gen->block_dim[0] = 32;
620 gen->block_dim[1] = 4;
621 gen->block_dim[2] = 4;
622 break;
625 size = extract_sizes(gen->sizes, "block", gen->kernel_id);
626 read_sizes_from_set(size, gen->block_dim, &gen->n_block);
629 /* Extract user specified "grid" sizes from the "sizes" command line option,
630 * after filling in some potentially useful defaults.
632 static void read_grid_sizes(struct cuda_gen *gen)
634 int n = gen->n_parallel;
635 isl_set *size;
637 gen->n_grid = (n <= 2) ? n : 2;
638 switch (gen->n_grid) {
639 case 1:
640 gen->grid_dim[0] = 32768;
641 break;
642 default:
643 gen->grid_dim[0] = 256;
644 gen->grid_dim[1] = 256;
645 break;
648 size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
649 read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
652 /* Extract user specified sizes from the "sizes" command line option
653 * after filling in some potentially useful defaults.
655 static void read_sizes(struct cuda_gen *gen)
657 read_tile_sizes(gen);
658 read_block_sizes(gen);
659 read_grid_sizes(gen);
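/* For example (sizes are hypothetical), passing the "sizes" command line
 * option
 *
 *     { kernel[0] -> tile[32, 32]; kernel[0] -> block[16, 16];
 *       kernel[0] -> grid[256, 256] }
 *
 * makes kernel 0 use 32x32 tiles, 16x16 thread blocks and a 256x256 grid,
 * while kernels that are not mentioned keep the defaults filled in above.
 */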
662 static void free_stmts(struct cuda_stmt *stmts, int n)
664 int i;
666 for (i = 0; i < n; ++i) {
667 struct cuda_stmt_access *access, *next;
669 for (access = stmts[i].accesses; access; access = next) {
670 next = access->next;
671 isl_map_free(access->access);
672 free(access);
675 isl_set_free(stmts[i].domain);
677 free(stmts);
680 void clear_cuda_gen(struct cuda_gen *gen)
682 free_stmts(gen->stmts, gen->n_stmts);
683 free_array_info(gen);
684 isl_union_map_free(gen->sizes);
685 isl_set_free(gen->context);
686 isl_union_set_free(gen->copy_in);
687 isl_union_map_free(gen->sched);
688 isl_union_map_free(gen->read);
689 isl_union_map_free(gen->write);
692 static void print_reverse_list(FILE *out, int len, int *list)
694 int i;
696 if (len == 0)
697 return;
699 fprintf(out, "(");
700 for (i = 0; i < len; ++i) {
701 if (i)
702 fprintf(out, ", ");
703 fprintf(out, "%d", list[len - 1 - i]);
705 fprintf(out, ")");
708 static void print_kernel_launch(struct cuda_gen *gen,
709 __isl_keep isl_union_set *arrays)
711 int i;
712 int first = 1;
713 unsigned nparam;
714 isl_space *dim;
716 print_indent(gen->code.dst, gen->code.indent);
717 fprintf(gen->code.dst, "kernel%d <<<k%d_dimGrid, k%d_dimBlock>>> (",
718 gen->kernel_id, gen->kernel_id, gen->kernel_id);
719 fprintf(gen->cuda.kernel_c, "__global__ void kernel%d(",
720 gen->kernel_id);
721 fprintf(gen->cuda.kernel_h, "__global__ void kernel%d(",
722 gen->kernel_id);
724 for (i = 0; i < gen->n_array; ++i) {
725 isl_space *dim;
726 isl_set *arr;
727 int empty;
729 dim = isl_space_copy(gen->array[i].dim);
730 arr = isl_union_set_extract_set(arrays, dim);
731 empty = isl_set_fast_is_empty(arr);
732 isl_set_free(arr);
733 if (empty)
734 continue;
736 if (!first) {
737 fprintf(gen->code.dst, ", ");
738 fprintf(gen->cuda.kernel_c, ", ");
739 fprintf(gen->cuda.kernel_h, ", ");
742 if (cuda_array_is_read_only_scalar(&gen->array[i])) {
743 fprintf(gen->code.dst, "%s", gen->array[i].name);
744 fprintf(gen->cuda.kernel_c, "%s %s",
745 gen->array[i].type, gen->array[i].name);
746 fprintf(gen->cuda.kernel_h, "%s %s",
747 gen->array[i].type, gen->array[i].name);
748 } else {
749 fprintf(gen->code.dst, "dev_%s", gen->array[i].name);
750 fprintf(gen->cuda.kernel_c, "%s *%s",
751 gen->array[i].type, gen->array[i].name);
752 fprintf(gen->cuda.kernel_h, "%s *%s",
753 gen->array[i].type, gen->array[i].name);
756 first = 0;
759 dim = isl_union_set_get_space(arrays);
760 nparam = isl_space_dim(dim, isl_dim_param);
761 for (i = 0; i < nparam; ++i) {
762 const char *name = isl_space_get_dim_name(dim, isl_dim_param, i);
763 if (!first) {
764 fprintf(gen->code.dst, ", ");
765 fprintf(gen->cuda.kernel_c, ", ");
766 fprintf(gen->cuda.kernel_h, ", ");
768 fprintf(gen->code.dst, "%s", name);
769 fprintf(gen->cuda.kernel_c, "int %s", name);
770 fprintf(gen->cuda.kernel_h, "int %s", name);
771 first = 0;
773 isl_space_free(dim);
775 for (i = 0; i < gen->tile_first; ++i) {
776 if (!first) {
777 fprintf(gen->code.dst, ", ");
778 fprintf(gen->cuda.kernel_c, ", ");
779 fprintf(gen->cuda.kernel_h, ", ");
781 fprintf(gen->code.dst, "h%d", i);
782 fprintf(gen->cuda.kernel_c, "int h%d", i);
783 fprintf(gen->cuda.kernel_h, "int h%d", i);
784 first = 0;
787 fprintf(gen->code.dst, ");\n");
788 fprintf(gen->cuda.kernel_c, ")\n");
789 fprintf(gen->cuda.kernel_h, ");\n");
791 fprintf(gen->code.dst, "cudaCheckKernel();\n");
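/* For a hypothetical kernel 0 that accesses a float array A and uses a
 * parameter N, this prints a launch of the form
 *
 *     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_A, N);
 *     cudaCheckKernel();
 *
 * in the host code and a matching declaration
 *
 *     __global__ void kernel0(float *A, int N)
 *
 * in the generated kernel source and header.
 */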
794 /* Construct a map from a domain of dimensionality "len"
795 * to a domain of dimensionality "len" + "tile_len" that tiles
796 * the "tile_len" coordinates starting at "first".
797 * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
798 * "dim" prescribes the parameters.
800 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
801 int first, int tile_len, int *tile_size)
803 int i;
804 isl_int v;
805 isl_basic_map *bmap;
806 isl_constraint *c;
807 isl_local_space *ls;
809 isl_int_init(v);
811 dim = isl_space_add_dims(dim, isl_dim_in, len);
812 dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
813 bmap = isl_basic_map_universe(isl_space_copy(dim));
814 ls = isl_local_space_from_space(dim);
816 for (i = 0; i < len - tile_len; ++i) {
817 int j = i < first ? i : i + tile_len;
818 int k = i < first ? i : i + 2 * tile_len;
820 c = isl_equality_alloc(isl_local_space_copy(ls));
821 isl_int_set_si(v, -1);
822 isl_constraint_set_coefficient(c, isl_dim_in, j, v);
823 isl_int_set_si(v, 1);
824 isl_constraint_set_coefficient(c, isl_dim_out, k, v);
825 bmap = isl_basic_map_add_constraint(bmap, c);
828 for (i = 0; i < tile_len; ++i) {
829 c = isl_equality_alloc(isl_local_space_copy(ls));
830 isl_int_set_si(v, -1);
831 isl_constraint_set_coefficient(c, isl_dim_in, first + i, v);
832 isl_int_set_si(v, tile_size[i]);
833 isl_constraint_set_coefficient(c, isl_dim_out, first + i, v);
834 isl_int_set_si(v, 1);
835 isl_constraint_set_coefficient(c, isl_dim_out,
836 first + i + tile_len, v);
837 bmap = isl_basic_map_add_constraint(bmap, c);
839 c = isl_inequality_alloc(isl_local_space_copy(ls));
840 isl_int_set_si(v, 1);
841 isl_constraint_set_coefficient(c, isl_dim_out,
842 first + i + tile_len, v);
843 bmap = isl_basic_map_add_constraint(bmap, c);
845 c = isl_inequality_alloc(isl_local_space_copy(ls));
846 isl_int_set_si(v, -1);
847 isl_constraint_set_coefficient(c, isl_dim_out,
848 first + i + tile_len, v);
849 isl_int_set_si(v, tile_size[i] - 1);
850 isl_constraint_set_constant(c, v);
851 bmap = isl_basic_map_add_constraint(bmap, c);
854 isl_local_space_free(ls);
855 isl_int_clear(v);
857 return isl_map_from_basic_map(bmap);
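/* For example (hypothetical sizes), with len = 2, first = 0, tile_len = 2
 * and tile_size = { 32, 16 }, the constructed map is
 *
 *     [i, j] -> [floor(i/32), floor(j/16), i % 32, j % 16]
 *
 * i.e., the tile loops come first, followed by the point loops within a tile.
 */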
860 /* Construct a map from a domain of dimensionality "len"
861 * to a domain of dimensionality "len" + "wrap_len" that "wraps"
862 * the "wrap_len" coordinates starting at "first" according to "wrap_size".
863 * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
864 * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
865 * that are projected out at the end.
866 * "dim" prescribes the parameters.
868 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
869 int first, int wrap_len, int *wrap_size)
871 int i;
872 isl_basic_map *bmap;
873 isl_constraint *c;
874 isl_local_space *ls;
876 dim = isl_space_add_dims(dim, isl_dim_in, len);
877 dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
878 bmap = isl_basic_map_universe(isl_space_copy(dim));
879 ls = isl_local_space_from_space(dim);
881 for (i = 0; i < len; ++i) {
882 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
884 c = isl_equality_alloc(isl_local_space_copy(ls));
885 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
886 isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
887 bmap = isl_basic_map_add_constraint(bmap, c);
890 for (i = 0; i < wrap_len; ++i) {
891 c = isl_equality_alloc(isl_local_space_copy(ls));
892 isl_constraint_set_coefficient_si(c, isl_dim_out,
893 first + i, -1);
894 isl_constraint_set_coefficient_si(c, isl_dim_out,
895 first + wrap_len + i, 1);
896 isl_constraint_set_coefficient_si(c, isl_dim_out,
897 first + 2 * wrap_len + i, wrap_size[i]);
898 bmap = isl_basic_map_add_constraint(bmap, c);
900 c = isl_inequality_alloc(isl_local_space_copy(ls));
901 isl_constraint_set_coefficient_si(c, isl_dim_out,
902 first + wrap_len + i, 1);
903 bmap = isl_basic_map_add_constraint(bmap, c);
905 c = isl_inequality_alloc(isl_local_space_copy(ls));
906 isl_constraint_set_coefficient_si(c, isl_dim_out,
907 first + wrap_len + i, -1);
908 isl_constraint_set_constant_si(c, wrap_size[i] - 1);
909 bmap = isl_basic_map_add_constraint(bmap, c);
912 isl_local_space_free(ls);
914 bmap = isl_basic_map_project_out(bmap, isl_dim_out,
915 first + 2 * wrap_len, wrap_len);
917 return isl_map_from_basic_map(bmap);
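/* For example (hypothetical sizes), with len = 2, first = 0, wrap_len = 2
 * and wrap_size = { 32, 16 }, the constructed map is
 *
 *     [i, j] -> [i, j, i % 32, j % 16]
 *
 * i.e., the original coordinates are kept and the wrapped copies follow them.
 */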
920 /* Add "n" parameters named prefix%d.
922 static __isl_give isl_set *add_params( __isl_take isl_set *set,
923 int n, const char *prefix)
925 int i;
926 unsigned nparam;
927 char name[20];
929 nparam = isl_set_dim(set, isl_dim_param);
930 set = isl_set_add_dims(set, isl_dim_param, n);
932 for (i = 0; i < n; ++i) {
933 snprintf(name, sizeof(name), "%s%d", prefix, i);
934 set = isl_set_set_dim_name(set, isl_dim_param,
935 nparam + i, name);
938 return set;
941 /* Equate the "n" dimensions of "set" starting at "first" to
942 * freshly created parameters named prefix%d.
944 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
945 int first, int n, const char *prefix)
947 int i;
948 unsigned nparam;
949 isl_int v;
950 isl_space *dim;
951 isl_basic_set *bset;
952 isl_constraint *c;
953 isl_local_space *ls;
955 nparam = isl_set_dim(set, isl_dim_param);
957 set = add_params(set, n, prefix);
959 dim = isl_set_get_space(set);
960 bset = isl_basic_set_universe(isl_space_copy(dim));
961 ls = isl_local_space_from_space(dim);
963 isl_int_init(v);
965 for (i = 0; i < n; ++i) {
966 c = isl_equality_alloc(isl_local_space_copy(ls));
967 isl_int_set_si(v, -1);
968 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
969 isl_int_set_si(v, 1);
970 isl_constraint_set_coefficient(c, isl_dim_set, first + i, v);
971 bset = isl_basic_set_add_constraint(bset, c);
974 isl_int_clear(v);
975 isl_local_space_free(ls);
977 return isl_set_intersect(set, isl_set_from_basic_set(bset));
980 static __isl_give isl_set *parametrization(__isl_take isl_space *dim,
981 int len, int first, int n, const char *prefix)
983 isl_set *set;
985 dim = isl_space_add_dims(dim, isl_dim_set, len);
986 set = isl_set_universe(dim);
988 return parametrize(set, first, n, prefix);
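/* For example (hypothetical arguments), parametrization(dim, 4, 0, 2, "h")
 * returns a 4-dimensional universe set with two extra parameters h0 and h1
 * and with its first two set dimensions equated to h0 and h1.
 */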
991 /* Tile the B loops over the tile sizes and then tile/wrap
992 * the T1 loops over the blocks.
994 static __isl_give isl_union_map *tile_schedule(struct cuda_gen *gen,
995 __isl_take isl_union_map *sched)
997 isl_space *dim;
998 isl_map *tiling, *block_tiling;
1000 dim = isl_union_map_get_space(sched);
1001 tiling = tile(isl_space_copy(dim), gen->untiled_len,
1002 gen->tile_first, gen->tile_len, gen->tile_size);
1004 if (gen->options->wrap)
1005 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
1006 gen->tile_first, gen->n_grid, gen->grid_dim);
1007 else
1008 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
1009 gen->tile_first, gen->n_grid, gen->grid_dim);
1011 gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
1013 tiling = isl_map_apply_range(tiling, block_tiling);
1015 sched = isl_union_map_apply_range(sched,
1016 isl_union_map_from_map(tiling));
1018 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1020 return sched;
1023 static __isl_give isl_union_map *parametrize_tiled_schedule(
1024 struct cuda_gen *gen, __isl_take isl_union_map *sched)
1026 isl_space *dim;
1027 isl_set *par;
1029 dim = isl_union_map_get_space(sched);
1030 par = parametrization(dim, gen->tiled_len, 0, gen->tile_first, "h");
1031 sched = isl_union_map_intersect_range(sched,
1032 isl_union_set_from_set(par));
1034 dim = isl_union_map_get_space(sched);
1035 par = parametrization(dim, gen->tiled_len,
1036 gen->tile_first + gen->n_grid, gen->n_grid, "b");
1037 sched = isl_union_map_intersect_range(sched,
1038 isl_union_set_from_set(par));
1040 return sched;
1043 /* Tile/wrap the P1 loops over the threads.
1045 static __isl_give isl_union_map *thread_tile_schedule(struct cuda_gen *gen,
1046 __isl_take isl_union_map *sched)
1048 isl_space *dim;
1049 isl_map *tiling;
1050 isl_set *par;
1052 dim = isl_union_map_get_space(sched);
1054 if (gen->options->wrap)
1055 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
1056 gen->shared_len, gen->n_block, gen->block_dim);
1057 else
1058 tiling = tile(isl_space_copy(dim), gen->tiled_len,
1059 gen->shared_len, gen->n_block, gen->block_dim);
1060 gen->thread_tiled_len = gen->tiled_len + gen->n_block;
1062 sched = isl_union_map_apply_range(sched,
1063 isl_union_map_from_map(tiling));
1065 par = parametrization(dim, gen->thread_tiled_len,
1066 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1067 gen->n_block, "t");
1068 sched = isl_union_map_intersect_range(sched,
1069 isl_union_set_from_set(par));
1071 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1073 return sched;
1076 /* If the user asked for it, scale the shared memory tile loops
1077 * (T1T and T2) of "sched" by gen->tile_size[i].
1078 * If we are not performing "wrapping", then additionally scale the T1P
1079 * loops by gen->grid_dim[i].
1081 static __isl_give isl_union_map *scale_tile_loops(struct cuda_gen *gen,
1082 __isl_take isl_union_map *sched)
1084 int i;
1085 isl_space *dim;
1086 isl_basic_map *scale;
1087 isl_constraint *c;
1088 isl_local_space *ls;
1090 if (!gen->options->scale_tile_loops)
1091 return sched;
1093 dim = isl_union_map_get_space(sched);
1094 dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
1095 dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
1096 scale = isl_basic_map_universe(isl_space_copy(dim));
1097 ls = isl_local_space_from_space(dim);
1099 for (i = 0; i < gen->tiled_len; ++i) {
1100 int f = 1;
1102 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1103 f = gen->tile_size[i - gen->tile_first];
1104 if (!gen->options->wrap)
1105 f *= gen->grid_dim[i - gen->tile_first];
1106 } else if (i >= gen->tile_first + gen->n_grid &&
1107 i < gen->tile_first + gen->n_grid + gen->tile_len) {
1108 f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1111 c = isl_equality_alloc(isl_local_space_copy(ls));
1112 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1113 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1114 scale = isl_basic_map_add_constraint(scale, c);
1117 isl_local_space_free(ls);
1119 sched = isl_union_map_apply_range(sched,
1120 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1122 return sched;
1125 /* If we are not performing "wrapping" and if the user asked for it,
1126 * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1128 static __isl_give isl_union_map *scale_thread_tile_loops(struct cuda_gen *gen,
1129 __isl_take isl_union_map *sched)
1131 int i;
1132 isl_space *dim;
1133 isl_basic_map *scale;
1134 isl_constraint *c;
1135 isl_local_space *ls;
1137 if (gen->options->wrap)
1138 return sched;
1139 if (!gen->options->scale_tile_loops)
1140 return sched;
1142 dim = isl_union_map_get_space(sched);
1143 dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1144 dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1145 scale = isl_basic_map_universe(isl_space_copy(dim));
1146 ls = isl_local_space_from_space(dim);
1148 for (i = 0; i < gen->thread_tiled_len; ++i) {
1149 int f = 1;
1151 if (i >= gen->shared_len &&
1152 i < gen->shared_len + gen->n_block)
1153 f = gen->block_dim[i - gen->shared_len];
1155 c = isl_equality_alloc(isl_local_space_copy(ls));
1156 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1157 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1158 scale = isl_basic_map_add_constraint(scale, c);
1161 isl_local_space_free(ls);
1163 sched = isl_union_map_apply_range(sched,
1164 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1166 return sched;
1169 /* If we are not performing "wrapping" and if the user asked for it,
1170 * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1172 static __isl_give isl_union_map *scale_access_tile_loops(struct cuda_gen *gen,
1173 __isl_take isl_union_map *sched, int len, int first, int n_tile)
1175 int i;
1176 isl_space *dim;
1177 isl_basic_map *scale;
1178 isl_constraint *c;
1179 isl_local_space *ls;
1181 if (gen->options->wrap)
1182 return sched;
1183 if (!gen->options->scale_tile_loops)
1184 return sched;
1186 dim = isl_union_map_get_space(sched);
1187 dim = isl_space_add_dims(dim, isl_dim_in, len);
1188 dim = isl_space_add_dims(dim, isl_dim_out, len);
1189 scale = isl_basic_map_universe(isl_space_copy(dim));
1190 ls = isl_local_space_from_space(dim);
1192 for (i = 0; i < len; ++i) {
1193 int f = 1;
1195 if (i >= first && i < first + n_tile)
1196 f = gen->block_dim[i - first];
1198 c = isl_equality_alloc(isl_local_space_copy(ls));
1199 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1200 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1201 scale = isl_basic_map_add_constraint(scale, c);
1204 isl_local_space_free(ls);
1206 sched = isl_union_map_apply_range(sched,
1207 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1209 return sched;
1212 /* If print_user_stmt is set, we want to print the statements ourselves,
1213 * instead of relying on the C preprocessor. If so, we need to use
1214 * the stop option so that the domains will be saved on the statement
1215 * nodes.
1217 static void print_cloog_shared_body(struct cuda_gen *gen,
1218 __isl_keep isl_set *context, __isl_keep isl_union_map *sched, int len,
1219 void (*print_user_stmt)(struct clast_printer_info *info,
1220 struct clast_user_stmt *s),
1221 int first_unroll)
1223 int i;
1224 CloogOptions *options;
1225 CloogDomain *cloog_context;
1226 CloogUnionDomain *ud;
1227 CloogInput *input;
1228 struct clast_stmt *stmt;
1229 char name[20];
1231 sched = isl_union_map_copy(sched);
1232 sched = isl_union_map_align_params(sched, isl_set_get_space(context));
1234 options = cloog_options_malloc(gen->state);
1235 options->language = CLOOG_LANGUAGE_C;
1236 options->strides = 1;
1237 options->sh = 1;
1238 options->f = len;
1239 options->l = -1;
1240 options->override = 1;
1241 options->save_domains = 1;
1242 options->noscalars = 1;
1243 options->first_unroll = first_unroll;
1245 ud = cloog_union_domain_from_isl_union_map(sched);
1246 for (i = 0; i < len; ++i) {
1247 snprintf(name, sizeof(name), "c%d", i);
1248 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
1250 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
1251 input = cloog_input_alloc(cloog_context, ud);
1253 stmt = cloog_clast_create_from_input(input, options);
1255 gen->stmt_code.indent = gen->kernel_code.indent;
1256 gen->stmt_code.dst = gen->cuda.kernel_c;
1257 gen->stmt_code.print_user_stmt = print_user_stmt;
1258 gen->stmt_code.print_user_stmt_list = NULL;
1259 gen->stmt_code.print_for_head = NULL;
1260 gen->stmt_code.print_for_foot = NULL;
1261 gen->stmt_code.user = gen;
1262 print_clast(&gen->stmt_code, stmt);
1264 cloog_clast_free(stmt);
1265 cloog_options_free(options);
1268 /* Add "len" parameters p[i] called prefix%d,
1269 * with bounds 0 <= p[i] < size[i].
1271 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1272 int len, int *size, const char *prefix)
1274 int i;
1275 unsigned nparam;
1276 isl_int v;
1277 isl_space *dim;
1278 isl_basic_set *bset;
1279 isl_constraint *c;
1280 isl_local_space *ls;
1281 char name[20];
1283 nparam = isl_set_dim(set, isl_dim_param);
1284 set = isl_set_add_dims(set, isl_dim_param, len);
1286 for (i = 0; i < len; ++i) {
1287 snprintf(name, sizeof(name), "%s%d", prefix, i);
1288 set = isl_set_set_dim_name(set, isl_dim_param,
1289 nparam + i, name);
1292 dim = isl_set_get_space(set);
1293 bset = isl_basic_set_universe(isl_space_copy(dim));
1294 ls = isl_local_space_from_space(dim);
1296 isl_int_init(v);
1298 for (i = 0; i < len; ++i) {
1299 c = isl_inequality_alloc(isl_local_space_copy(ls));
1300 isl_int_set_si(v, 1);
1301 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1302 bset = isl_basic_set_add_constraint(bset, c);
1304 c = isl_inequality_alloc(isl_local_space_copy(ls));
1305 isl_int_set_si(v, -1);
1306 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1307 isl_int_set_si(v, size[i] - 1);
1308 isl_constraint_set_constant(c, v);
1309 bset = isl_basic_set_add_constraint(bset, c);
1312 isl_int_clear(v);
1313 isl_local_space_free(ls);
1315 return isl_set_intersect(set, isl_set_from_basic_set(bset));
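/* For example (hypothetical sizes), add_bounded_parameters(set, 2,
 * gen->block_dim, "t") with block_dim = { 16, 8 } adds parameters t0 and t1
 * together with the constraints 0 <= t0 <= 15 and 0 <= t1 <= 7.
 */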
1318 static void print_shared_body(struct cuda_gen *gen,
1319 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched,
1320 int len, void (*print_user_stmt)(struct clast_printer_info *info,
1321 struct clast_user_stmt *s),
1322 int first_unroll)
1324 isl_set *context;
1326 context = isl_set_copy(shared_domain);
1327 context = parametrize(context, 0, gen->shared_len, "g");
1328 context = isl_set_project_out(context, isl_dim_set, 0, gen->shared_len);
1329 context = add_bounded_parameters(context,
1330 gen->n_block, gen->block_dim, "t");
1332 print_cloog_shared_body(gen, context, sched, len, print_user_stmt,
1333 first_unroll);
1335 isl_set_free(context);
1338 /* Given a tile of an array, construct a map that maps each element
1339 * of the tile to a copy of the tile shifted to the origin
1340 * (based on the lower bounds in group->private_bound or group->shared_bound).
1341 * If any of the indices is strided, then {private,shared}_bound[i].shift_map
1342 * is applied to the index first.
1343 * The domain of the resulting map is "access",
1344 * while the range space is anonymous.
1346 static __isl_give isl_map *shift_access(__isl_take isl_set *access,
1347 struct cuda_array_ref_group *group)
1349 int i;
1350 isl_space *dim;
1351 isl_basic_set *bset;
1352 isl_basic_map *bmap;
1353 isl_aff *lb;
1354 isl_basic_set *offset;
1355 isl_basic_map *shift;
1356 isl_basic_map *pre_shift;
1357 isl_map *sched;
1358 const char *name;
1359 struct cuda_array_bound *bounds;
1360 int n_index = group->array->n_index;
1362 bounds = group->private_bound;
1363 if (!bounds)
1364 bounds = group->shared_bound;
1366 dim = isl_set_get_space(access);
1367 dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
1368 offset = isl_basic_set_universe(dim);
1369 for (i = 0; i < n_index; ++i) {
1370 lb = isl_aff_copy(bounds[i].lb);
1371 bmap = isl_basic_map_from_aff(lb);
1372 bset = isl_basic_map_range(bmap);
1373 offset = isl_basic_set_flat_product(offset, bset);
1375 offset = isl_basic_set_neg(offset);
1377 dim = isl_space_map_from_set(isl_set_get_space(access));
1378 shift = isl_basic_map_identity(dim);
1379 shift = isl_basic_map_set_tuple_name(shift, isl_dim_out, NULL);
1381 bset = isl_basic_set_universe(isl_set_get_space(access));
1382 bmap = isl_basic_map_from_domain_and_range(bset, offset);
1384 shift = isl_basic_map_sum(shift, bmap);
1386 dim = isl_set_get_space(access);
1387 dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
1388 dim = isl_space_map_from_set(dim);
1389 pre_shift = isl_basic_map_universe(isl_space_copy(dim));
1390 dim = isl_space_add_dims(dim, isl_dim_in, 1);
1391 dim = isl_space_add_dims(dim, isl_dim_out, 1);
1392 for (i = 0; i < n_index; ++i) {
1393 if (!bounds[i].shift_map)
1394 bmap = isl_basic_map_identity(isl_space_copy(dim));
1395 else
1396 bmap = isl_basic_map_copy(bounds[i].shift_map);
1397 pre_shift = isl_basic_map_flat_product(pre_shift, bmap);
1399 isl_space_free(dim);
1400 name = isl_basic_map_get_tuple_name(shift, isl_dim_in);
1401 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_in, name);
1402 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_out, name);
1403 shift = isl_basic_map_apply_range(pre_shift, shift);
1405 sched = isl_map_from_basic_map(shift);
1406 sched = isl_map_intersect_domain(sched, access);
1408 return sched;
1411 /* Construct a schedule for iterating over all elements in the given
1412 * piece of an array. The schedule iterates over a copy of the piece
1413 * that is shifted to the origin.
1414 * We subsequently also perform the tiling/wrapping over the threads.
1416 * In particular, we tile the final iterators so that the final thread
1417 * dimension runs over the final array dimension.
1418 * However, if those final iterators have only a single iteration,
1419 * we try to tile earlier iterators instead.
1421 static __isl_give isl_union_map *access_schedule(struct cuda_gen *gen,
1422 __isl_take isl_set *access, struct cuda_array_ref_group *group)
1424 isl_space *dim;
1425 isl_map *sched;
1426 isl_union_map *usched;
1427 isl_map *tiling;
1428 isl_set *par;
1429 unsigned nvar = isl_set_dim(access, isl_dim_set);
1430 int n_tile;
1431 int first;
1433 sched = shift_access(access, group);
1435 n_tile = gen->n_block;
1436 if (n_tile > nvar) {
1437 int i;
1438 sched = isl_map_insert_dims(sched,
1439 isl_dim_out, 0, n_tile - nvar);
1440 for (i = 0; i < n_tile - nvar; ++i)
1441 sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1442 nvar = n_tile;
1445 first = nvar - n_tile;
1447 for (; first > 0; first --)
1448 if (!isl_map_plain_is_fixed(sched, isl_dim_out,
1449 first + n_tile - 1, NULL))
1450 break;
1452 dim = isl_map_get_space(sched);
1453 dim = isl_space_params(dim);
1454 if (gen->options->wrap)
1455 tiling = wrap(isl_space_copy(dim), nvar, first,
1456 n_tile, gen->block_dim);
1457 else
1458 tiling = tile(isl_space_copy(dim), nvar, first,
1459 n_tile, gen->block_dim);
1460 sched = isl_map_apply_range(sched, tiling);
1462 par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1463 usched = isl_union_map_from_map(sched);
1464 usched = isl_union_map_intersect_range(usched,
1465 isl_union_set_from_set(par));
1467 usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1468 first, n_tile);
1470 return usched;
1473 /* Print an access to the element in the global memory copy of the
1474 * given array that corresponds to the element described by "pma"
1475 * of the original array.
1476 * The copy in global memory has been linearized, so we need to take
1477 * the array size into account.
1479 static void print_global_index(FILE *out,
1480 struct cuda_array_info *array, __isl_keep isl_pw_multi_aff *pma,
1481 __isl_keep isl_set *domain)
1483 int i;
1484 isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
1485 isl_printer *prn;
1487 if (cuda_array_is_scalar(array)) {
1488 fprintf(out, "*%s", array->name);
1489 return;
1492 fprintf(out, "%s[", array->name);
1493 prn = isl_printer_to_file(ctx, out);
1494 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1495 for (i = 0; i + 1 < array->n_index; ++i)
1496 prn = isl_printer_print_str(prn, "(");
1497 for (i = 0; i < array->n_index; ++i) {
1498 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);
1499 pa = isl_pw_aff_coalesce(pa);
1500 pa = isl_pw_aff_gist(pa, isl_set_copy(domain));
1501 if (i) {
1502 prn = isl_printer_print_str(prn, ") * (");
1503 prn = isl_printer_print_pw_aff(prn,
1504 array->local_bound[i]);
1505 prn = isl_printer_print_str(prn, ") + ");
1507 prn = isl_printer_print_pw_aff(prn, pa);
1508 isl_pw_aff_free(pa);
1510 isl_printer_free(prn);
1511 fprintf(out, "]");
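/* For a hypothetical two-dimensional array A with bounds N and M, an access
 * to element (i, j) is thus printed as
 *
 *     A[(i) * (M) + j]
 *
 * on the linearized global copy.
 */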
1514 /* Given an index expression into a tile of an array, adjust the expression
1515 * to a shift of the tile to the origin
1516 * (based on the lower bounds in array->shared_bound).
1517 * If the index is strided, then we first add
1518 * bound->shift and divide by bound->stride.
1520 static __isl_give isl_pw_aff *shift_index(__isl_take isl_pw_aff *pa,
1521 struct cuda_array_info *array,
1522 struct cuda_array_bound *bound, __isl_take isl_set *domain)
1524 isl_aff *lb;
1525 isl_pw_aff *tmp;
1527 if (bound->shift) {
1528 isl_aff *shift;
1529 shift = bound->shift;
1530 shift = isl_aff_copy(shift);
1531 shift = isl_aff_project_domain_on_params(shift);
1532 shift = isl_aff_align_params(shift, isl_pw_aff_get_space(pa));
1533 tmp = isl_pw_aff_alloc(isl_set_copy(domain), shift);
1534 pa = isl_pw_aff_add(pa, tmp);
1535 pa = isl_pw_aff_scale_down(pa, bound->stride);
1538 lb = isl_aff_copy(bound->lb);
1539 lb = isl_aff_project_domain_on_params(lb);
1541 lb = isl_aff_align_params(lb, isl_pw_aff_get_space(pa));
1543 tmp = isl_pw_aff_alloc(isl_set_copy(domain), lb);
1544 pa = isl_pw_aff_sub(pa, tmp);
1545 pa = isl_pw_aff_coalesce(pa);
1546 pa = isl_pw_aff_gist(pa, domain);
1548 return pa;
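/* For example (hypothetical values), if bound->lb is 32 * b0, bound->stride
 * is 4 and bound->shift is 1, then an index expression i is rewritten to
 * (i + 1)/4 - 32 * b0.
 */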
1551 /* Print an access to the element in the private/shared memory copy of the
1552 * given array reference group that corresponds to the element described
1553 * by "pma" of the original array.
1554 * Since the array in private/shared memory is just a shifted copy of part
1555 * of the original array, we simply need to subtract the lower bound,
1556 * which was computed in can_tile_for_shared_memory.
1557 * If any of the indices is strided, then we first add
1558 * bounds[i].shift and divide by bounds[i].stride.
1560 static void print_local_index(FILE *out,
1561 struct cuda_array_ref_group *group, struct cuda_array_bound *bounds,
1562 __isl_keep isl_pw_multi_aff *pma, __isl_keep isl_set *domain)
1564 int i;
1565 isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
1566 isl_printer *prn;
1567 struct cuda_array_info *array = group->array;
1569 print_array_name(out, group);
1570 for (i = 0; i < array->n_index; ++i) {
1571 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);
1573 pa = shift_index(pa, array, &bounds[i], isl_set_copy(domain));
1575 fprintf(out, "[");
1576 prn = isl_printer_to_file(ctx, out);
1577 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1578 prn = isl_printer_print_pw_aff(prn, pa);
1579 isl_printer_free(prn);
1580 fprintf(out, "]");
1581 isl_pw_aff_free(pa);
1585 /* This function is called for each leaf in the clast of the code
1586 * for copying to or from shared/private memory.
1587 * The statement name is {read,write}_{shared,private}_<array>.
1589 * The schedule iterates over the array elements, so we can use
1590 * the domain of copy_sched at the current scheduling position
1591 * as the index of the array.
1593 static void print_copy_statement(struct clast_printer_info *code,
1594 struct clast_user_stmt *u)
1596 struct cuda_gen *gen = code->user;
1597 isl_set *domain;
1598 isl_map *sched;
1599 struct cuda_array_ref_group *group = gen->copy_group;
1600 struct cuda_array_bound *bounds = gen->copy_bound;
1601 unsigned n_in;
1602 unsigned n_out;
1603 isl_space *dim;
1604 isl_set *param;
1605 isl_set *index;
1606 isl_pw_multi_aff *pma;
1607 int read;
1609 read = !strncmp(u->statement->name, "read", 4);
1611 domain = extract_host_domain(u);
1612 assert(domain);
1614 sched = isl_map_copy(gen->copy_sched);
1615 sched = isl_map_reverse(sched);
1616 sched = isl_map_intersect_domain(sched, domain);
1617 n_in = isl_map_dim(sched, isl_dim_in);
1618 n_out = isl_map_dim(sched, isl_dim_out);
1619 dim = isl_map_get_space(sched);
1620 dim = isl_space_drop_dims(dim, isl_dim_in, 0, n_in);
1621 dim = isl_space_drop_dims(dim, isl_dim_out, 0, n_out);
1622 param = parametrization(dim, n_in, 0, n_in, "c");
1623 sched = isl_map_align_params(sched, isl_set_get_space(param));
1624 sched = isl_map_intersect_domain(sched, param);
1625 index = isl_map_range(sched);
1626 domain = isl_set_copy(index);
1627 pma = isl_pw_multi_aff_from_set(index);
1628 pma = isl_pw_multi_aff_coalesce(pma);
1629 domain = isl_set_params(domain);
1631 print_indent(code->dst, code->indent);
1632 if (read) {
1633 print_local_index(code->dst, group, bounds, pma, domain);
1634 fprintf(code->dst, " = ");
1635 print_global_index(code->dst, group->array, pma, domain);
1636 } else {
1637 print_global_index(code->dst, group->array, pma, domain);
1638 fprintf(code->dst, " = ");
1639 print_local_index(code->dst, group, bounds, pma, domain);
1641 fprintf(code->dst, ";\n");
1643 isl_pw_multi_aff_free(pma);
1644 isl_set_free(domain);
1647 static void print_shared_access(struct cuda_gen *gen,
1648 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
1649 const char *type, struct cuda_array_ref_group *group)
1651 const char *array_name;
1652 char *name;
1653 isl_ctx *ctx;
1654 isl_union_map *sched;
1655 unsigned nvar = isl_set_dim(access, isl_dim_set);
1656 int n_tile;
1658 ctx = isl_set_get_ctx(access);
1659 array_name = isl_set_get_tuple_name(access);
1660 name = isl_alloc_array(ctx, char,
1661 strlen(type) + sizeof("_shared_") + strlen(array_name) + 20);
1662 if (group->array->n_group > 1)
1663 sprintf(name, "%s_shared_%s_%d", type, array_name, group->nr);
1664 else
1665 sprintf(name, "%s_shared_%s", type, array_name);
1666 access = isl_set_set_tuple_name(access, name);
1667 free(name);
1669 sched = access_schedule(gen, access, group);
1671 n_tile = gen->n_block;
1672 if (n_tile > nvar)
1673 n_tile = nvar;
1675 gen->copy_sched = isl_map_from_union_map(isl_union_map_copy(sched));
1676 gen->copy_group = group;
1677 gen->copy_bound = group->shared_bound;
1679 print_shared_body(gen, shared_domain, sched, nvar + n_tile,
1680 &print_copy_statement, -1);
1682 isl_union_map_free(sched);
1683 isl_map_free(gen->copy_sched);
1686 /* Return the union of all read (read = 1) and/or write (write = 1)
1687 * access relations in the group.
1689 static __isl_give isl_union_map *group_access_relation(
1690 struct cuda_array_ref_group *group, int read, int write)
1692 int i;
1693 isl_union_map *access;
1695 access = isl_union_map_empty(isl_map_get_space(group->access));
1696 for (i = 0; i < group->n_ref; ++i) {
1697 isl_map *map_i;
1699 if (!((read && group->refs[i]->read) ||
1700 (write && group->refs[i]->write)))
1701 continue;
1702 map_i = isl_map_copy(group->refs[i]->access);
1703 access = isl_union_map_union(access,
1704 isl_union_map_from_map(map_i));
1707 return access;
1710 /* Check that none of the shared memory tiles involve any strides.
1712 static int no_strides(struct cuda_array_ref_group *group)
1714 int i;
1715 int n_index = group->array->n_index;
1717 for (i = 0; i < n_index; ++i)
1718 if (group->shared_bound[i].shift)
1719 return 0;
1721 return 1;
1724 /* Return a set containing the values of the given index i
1725 * of the elements in the array tile in global memory that corresponds
1726 * to the shared memory copy.
1727 * In particular, if a is the index, we return a set with constraints
1729 * tile_offset <= a <= tile_offset + tile_size - 1
1731 * and
1733 * 0 <= a <= array_size - 1
1736 static __isl_give isl_set *group_tile_dim(struct cuda_array_ref_group *group,
1737 int i)
1739 isl_basic_set *tile;
1740 isl_aff *aff;
1741 isl_constraint *c;
1742 isl_local_space *ls;
1743 isl_pw_aff *bound;
1744 isl_set *dom;
1745 isl_set *tile_set;
1747 aff = isl_aff_copy(group->shared_bound[i].lb);
1748 aff = isl_aff_add_dims(aff, isl_dim_in, 1);
1749 ls = isl_aff_get_domain_local_space(aff);
1750 aff = isl_aff_neg(aff);
1751 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1752 c = isl_inequality_from_aff(isl_aff_copy(aff));
1753 tile = isl_basic_set_from_constraint(c);
1755 aff = isl_aff_neg(aff);
1756 aff = isl_aff_add_constant(aff, group->shared_bound[i].size);
1757 aff = isl_aff_add_constant_si(aff, -1);
1758 c = isl_inequality_from_aff(aff);
1759 tile = isl_basic_set_add_constraint(tile, c);
1761 aff = isl_aff_zero_on_domain(ls);
1762 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1763 c = isl_inequality_from_aff(aff);
1764 tile = isl_basic_set_add_constraint(tile, c);
1766 bound = isl_pw_aff_copy(group->array->bound[i]);
1767 bound = isl_pw_aff_add_dims(bound, isl_dim_in, 1);
1768 ls = isl_local_space_from_space(isl_pw_aff_get_domain_space(bound));
1769 aff = isl_aff_zero_on_domain(ls);
1770 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1771 aff = isl_aff_add_constant_si(aff, 1);
1772 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
1774 tile_set = isl_pw_aff_ge_set(bound, isl_pw_aff_alloc(dom, aff));
1775 tile_set = isl_set_align_params(tile_set, isl_basic_set_get_space(tile));
1776 tile_set = isl_set_intersect(tile_set, isl_set_from_basic_set(tile));
1778 return tile_set;
1781 /* Return a set containing the elements in the array tile in
1782 * global memory that corresponds to the shared memory copy.
1784 static __isl_give isl_set *group_tile(struct cuda_array_ref_group *group)
1786 int i;
1787 int n_index = group->array->n_index;
1788 isl_set *tile;
1790 tile = group_tile_dim(group, 0);
1791 for (i = 1; i < n_index; ++i) {
1792 isl_set *tile_i;
1794 tile_i = group_tile_dim(group, i);
1795 tile = isl_set_flat_product(tile, tile_i);
1798 tile = isl_set_set_tuple_name(tile, group->array->name);
1800 return tile;
1803 /* Print code for reading into or writing from shared memory
1804 * the given array reference group.
1806 * sched maps the original iteration domains to the shared memory tile loops.
1808 * If we are performing a read from global memory to shared memory,
1809 * if the array involved is not a scalar and if the definition of the
1810 * shared memory tiles does not involve any strides, then we copy
1811 * the entire tile to shared memory. This may result in some extra
1812 * elements getting copied, but it should lead to simpler code
1813 * (which means that fewer registers may be needed) and less divergence.
1815 * Otherwise, we only copy the elements that will be read or have been written
1816 * in the kernel.
1818 * Note that the requirement that there be no strides can easily be lifted.
1819 * We would just need to add constraints of the form
1821 * shift + a = stride * alpha
1823 static int print_group_shared_accesses(struct cuda_gen *gen,
1824 struct cuda_array_ref_group *group, const char *type,
1825 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched)
1827 int read;
1828 isl_union_map *access;
1829 isl_union_set *uset;
1830 isl_set *access_set;
1832 if (group->private_bound)
1833 return 0;
1834 if (!group->shared_bound)
1835 return 0;
1837 read = !strcmp(type, "read");
1839 access = group_access_relation(group, read, !read);
1840 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
1841 uset = isl_union_map_range(access);
1843 if (isl_union_set_is_empty(uset)) {
1844 isl_union_set_free(uset);
1845 return 0;
1848 if (read && group->array->n_index > 0 && no_strides(group)) {
1849 isl_union_set_free(uset);
1850 access_set = group_tile(group);
1851 print_shared_access(gen, shared_domain, access_set,
1852 type, group);
1853 return 1;
1856 access_set = isl_set_from_union_set(uset);
1857 access_set = isl_set_coalesce(access_set);
1859 print_shared_access(gen, shared_domain, access_set, type, group);
1861 return 1;
1864 /* Print code for reading into or writing from shared memory at
1865 * the given level (-1 for innermost).
1867 * If we are not printing at the innermost level, then the dimensionality
1868 * of shared_domain may be smaller than gen->shared_len.
1869 * As the rest of the code assumes that the domain of access has
1870 * gen->shared_len dimensions, we therefore may need to embed this domain
1871 * in a higher dimensional space after intersection with shared_domain.
1873 static void print_shared_accesses(struct cuda_gen *gen,
1874 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
1875 const char *type, int level)
1877 int i, j;
1878 isl_space *dim;
1879 isl_map *proj;
1880 isl_set *par;
1881 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
1882 int sync = 0;
1883 isl_union_map *sched;
1885 shared_domain = isl_set_copy(shared_domain);
1886 sched = isl_union_map_copy(gen->tiled_sched);
1887 dim = isl_union_map_get_space(sched);
1888 proj = projection(dim, gen->tiled_len, shared_len);
1889 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
1890 sched = isl_union_map_intersect_range(sched,
1891 isl_union_set_from_set(isl_set_copy(shared_domain)));
1892 if (shared_len != gen->shared_len) {
1893 dim = isl_union_map_get_space(sched);
1894 proj = projection(dim, gen->shared_len, shared_len);
1895 proj = isl_map_reverse(proj);
1896 shared_domain = isl_set_apply(shared_domain,
1897 isl_map_copy(proj));
1898 sched = isl_union_map_apply_range(sched,
1899 isl_union_map_from_map(proj));
1902 dim = isl_union_map_get_space(sched);
1903 par = parametrization(dim, gen->shared_len, 0, gen->shared_len, "g");
1904 sched = isl_union_map_intersect_range(sched,
1905 isl_union_set_from_set(par));
1907 for (i = 0; i < gen->n_array; ++i) {
1908 struct cuda_array_info *array = &gen->array[i];
1910 for (j = 0; j < array->n_group; ++j) {
1911 if (array->groups[j]->print_shared_level != level)
1912 continue;
1914 if (print_group_shared_accesses(gen, array->groups[j],
1915 type, shared_domain, sched))
1916 sync = 1;
1920 isl_union_map_free(sched);
1921 isl_set_free(shared_domain);
1923 if (sync) {
1924 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
1925 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
1929 /* This function is called for each access to an array in some statement
1930 * in the original code.
1931 * Replace that access by an access to shared or (linearized) global memory.
1932 * Since the array in shared memory is just
1933 * a shifted copy of part of the original array, we simply need
1934 * to subtract the lower bound, which was computed
1935 * in can_tile_for_shared_memory.
1936 * If any of the indices is strided, then we first add
1937 * shared_bound[i].shift and divide by shared_bound[i].stride.
1939 * If the given array is accessed directly from global memory,
1940 * we don't need to perform any shifting and simply simplify
1941 * the expression in the context of the domain instead.
1943 * If the array space (range of access) has no name, then we are
1944 * accessing an iterator in the original program.
1946 static void print_access(struct cuda_gen *gen, __isl_take isl_map *access,
1947 int group_nr)
1949 int i;
1950 const char *name;
1951 unsigned n_index;
1952 struct cuda_array_info *array = NULL;
1953 isl_printer *prn;
1954 isl_pw_multi_aff *pma;
1955 isl_set *data_set;
1956 isl_set *domain;
1957 struct cuda_array_bound *bounds = NULL;
1959 access = isl_map_align_params(access,
1960 isl_set_get_space(gen->stmt_domain));
1962 data_set = isl_set_apply(isl_set_copy(gen->stmt_domain), access);
1964 name = isl_set_get_tuple_name(data_set);
1966 if (!name)
1967 fprintf(gen->cuda.kernel_c, "(");
1968 else {
1969 struct cuda_array_ref_group *group;
1971 for (i = 0; i < gen->n_array; ++i) {
1972 if (strcmp(name, gen->array[i].name))
1973 continue;
1974 array = &gen->array[i];
1976 assert(array);
1977 group = array->groups[group_nr];
1978 bounds = group->private_bound;
1979 if (!bounds)
1980 bounds = group->shared_bound;
1982 if (!bounds && cuda_array_is_scalar(array) && !array->read_only)
1983 fprintf(gen->cuda.kernel_c, "*");
1984 print_array_name(gen->cuda.kernel_c, group);
1986 if (cuda_array_is_scalar(array)) {
1987 isl_set_free(data_set);
1988 return;
1991 fprintf(gen->cuda.kernel_c, "[");
1995 n_index = isl_set_dim(data_set, isl_dim_set);
1996 pma = isl_pw_multi_aff_from_set(data_set);
1997 pma = isl_pw_multi_aff_coalesce(pma);
1999 prn = isl_printer_to_file(gen->ctx, gen->cuda.kernel_c);
2000 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
2002 if (!bounds)
2003 for (i = 0; i + 1 < n_index; ++i)
2004 prn = isl_printer_print_str(prn, "(");
2006 for (i = 0; i < n_index; ++i) {
2007 isl_pw_aff *index;
2009 index = isl_pw_multi_aff_get_pw_aff(pma, i);
2011 if (!array) {
2012 prn = isl_printer_print_pw_aff(prn, index);
2013 isl_pw_aff_free(index);
2014 continue;
2017 domain = isl_set_copy(gen->stmt_domain);
2018 domain = isl_set_params(domain);
2019 if (!bounds) {
2020 index = isl_pw_aff_coalesce(index);
2021 index = isl_pw_aff_gist(index, domain);
2022 } else
2023 index = shift_index(index, array, &bounds[i], domain);
2025 if (i) {
2026 if (!bounds) {
2027 prn = isl_printer_print_str(prn, ") * (");
2028 prn = isl_printer_print_pw_aff(prn,
2029 array->local_bound[i]);
2030 prn = isl_printer_print_str(prn, ") + ");
2031 } else
2032 prn = isl_printer_print_str(prn, "][");
2034 prn = isl_printer_print_pw_aff(prn, index);
2035 isl_pw_aff_free(index);
2037 if (!name)
2038 prn = isl_printer_print_str(prn, ")");
2039 else
2040 prn = isl_printer_print_str(prn, "]");
2041 isl_printer_free(prn);
2043 isl_pw_multi_aff_free(pma);
2046 struct cuda_access_print_info {
2047 struct cuda_gen *gen;
2048 struct cuda_stmt_access *access;
2051 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
2052 * with the pet printer. This means that whenever the pet printer prints a
2053 * pet access expression we have the corresponding cuda access available and can
2054 * print the modified access.
2056 static void print_cuda_access(struct pet_expr *expr, void *usr)
2058 struct cuda_access_print_info *info =
2059 (struct cuda_access_print_info *) usr;
2060 print_access(info->gen, isl_map_copy(info->access->access),
2061 info->access->group);
2062 info->access = info->access->next;
2065 static void print_stmt_body(struct cuda_gen *gen,
2066 FILE *out, struct cuda_stmt *stmt)
2068 struct cuda_access_print_info info;
2070 info.gen = gen;
2071 info.access = stmt->accesses;
2073 print_pet_expr(out, stmt->body, print_cuda_access, &info);
2074 fprintf(out, ";\n");
2077 /* This function is called for each leaf in the innermost clast,
2078 * i.e., for each statement.
2079 * We print the statement body, simplifying the accesses based
2080 * on the schedule.
2082 static void print_statement(struct clast_printer_info *code,
2083 struct clast_user_stmt *u)
2085 struct cuda_gen *gen = code->user;
2086 isl_space *dim;
2087 isl_set *par;
2088 isl_set *stmt_domain;
2089 isl_union_map *stmt_sched;
2090 isl_union_set *uset;
2091 int nr;
2092 struct cuda_stmt *stmt;
2094 nr = atoi(u->statement->name + 2);
2095 stmt = &gen->stmts[nr];
2097 stmt_domain = extract_host_domain(u);
2099 stmt_sched = isl_union_map_intersect_range(
2100 isl_union_map_copy(gen->local_sched),
2101 isl_union_set_from_set(extend(stmt_domain,
2102 gen->thread_tiled_len)));
2103 dim = isl_union_map_get_space(stmt_sched);
2104 par = parametrization(dim, gen->thread_tiled_len, 0,
2105 gen->thread_tiled_len, "c");
2106 stmt_sched = isl_union_map_intersect_range(stmt_sched,
2107 isl_union_set_from_set(par));
2109 uset = isl_union_map_domain(stmt_sched);
2110 dim = isl_union_set_get_space(uset);
2111 dim = isl_space_add_dims(dim, isl_dim_set,
2112 isl_set_dim(stmt->domain, isl_dim_set));
2113 dim = isl_space_set_tuple_name(dim, isl_dim_set, u->statement->name);
2114 gen->stmt_domain = isl_union_set_extract_set(uset, dim);
2115 isl_union_set_free(uset);
2117 print_indent(code->dst, code->indent);
2118 print_stmt_body(gen, code->dst, stmt);
2120 isl_set_free(gen->stmt_domain);
2123 static void print_private_access(struct cuda_gen *gen,
2124 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
2125 const char *type, struct cuda_array_ref_group *group)
2127 const char *array_name;
2128 char *name;
2129 isl_ctx *ctx;
2130 unsigned nvar = isl_set_dim(access, isl_dim_set);
2131 isl_union_map *usched;
2133 if (isl_set_fast_is_empty(access)) {
2134 isl_set_free(access);
2135 return;
2138 ctx = isl_set_get_ctx(access);
2139 array_name = isl_set_get_tuple_name(access);
2140 name = isl_alloc_array(ctx, char,
2141 strlen(type) + sizeof("_private_") + strlen(array_name) + 20);
2142 if (group->array->n_group > 1)
2143 sprintf(name, "%s_private_%s_%d", type, array_name, group->nr);
2144 else
2145 sprintf(name, "%s_private_%s", type, array_name);
2146 access = isl_set_set_tuple_name(access, name);
2147 free(name);
2149 gen->copy_sched = shift_access(access, group);
2150 gen->copy_group = group;
2151 gen->copy_bound = group->private_bound;
2153 usched = isl_union_map_from_map(isl_map_copy(gen->copy_sched));
2154 print_shared_body(gen, shared_domain, usched, nvar,
2155 &print_copy_statement, 1);
2156 isl_union_map_free(usched);
2158 isl_map_free(gen->copy_sched);
2161 /* Print code for reading into or writing from private memory
2162 * the given array reference group.
2164 * sched maps the original iteration domains to the shared memory tile loops.
2166 static void print_group_private_accesses(struct cuda_gen *gen,
2167 struct cuda_array_ref_group *group,
2168 const char *type, __isl_keep isl_set *shared_domain,
2169 unsigned first_shared, int shared_len, __isl_keep isl_union_map *sched)
2171 int read;
2172 isl_union_map *access;
2173 isl_union_set *uset;
2174 isl_set *access_set;
2176 if (!group->private_bound)
2177 return;
2179 read = !strcmp(type, "read");
2181 access = group_access_relation(group, read, !read);
2182 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
2183 access = isl_union_map_intersect(access,
2184 isl_union_map_copy(gen->private_access));
2185 uset = isl_union_map_range(access);
2187 if (isl_union_set_is_empty(uset)) {
2188 isl_union_set_free(uset);
2189 return;
2192 access_set = isl_set_from_union_set(uset);
2193 access_set = isl_set_coalesce(access_set);
2194 access_set = isl_set_eliminate(access_set, isl_dim_param,
2195 first_shared + shared_len,
2196 gen->shared_len - shared_len);
2198 print_private_access(gen, shared_domain, access_set, type, group);
2201 /* Print code for reading into or writing from private memory at
2202 * the given level (-1 for innermost).
2204 * If we are not printing at the innermost level, then the dimensionality
2205 * of shared_domain may be smaller than gen->shared_len.
2206 * As the rest of the code assumes that the domain of access has
2207 * gen->shared_len dimensions, we therefore may need to embed this domain
2208 * in a higher dimensional space after intersection with shared_domain.
2210 * This code is very similar to print_shared_accesses.
2211 * The main difference is that we need to take into account gen->private_access.
2213 static void print_private_accesses(struct cuda_gen *gen,
2214 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
2215 const char *type, int level)
2217 int i, j;
2218 isl_space *dim;
2219 isl_map *proj;
2220 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
2221 unsigned first_shared;
2222 isl_union_map *sched;
2224 shared_domain = isl_set_copy(shared_domain);
2225 sched = isl_union_map_copy(gen->tiled_sched);
2226 dim = isl_union_map_get_space(sched);
2227 first_shared = isl_space_dim(dim, isl_dim_param);
2228 proj = projection(dim, gen->tiled_len, shared_len);
2229 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2230 sched = isl_union_map_intersect_range(sched,
2231 isl_union_set_from_set(isl_set_copy(shared_domain)));
2232 if (shared_len != gen->shared_len) {
2233 dim = isl_union_map_get_space(sched);
2234 proj = projection(dim, gen->shared_len, shared_len);
2235 proj = isl_map_reverse(proj);
2236 shared_domain = isl_set_apply(shared_domain,
2237 isl_map_copy(proj));
2238 sched = isl_union_map_apply_range(sched,
2239 isl_union_map_from_map(proj));
2242 for (i = 0; i < gen->n_array; ++i) {
2243 struct cuda_array_info *array = &gen->array[i];
2245 for (j = 0; j < array->n_group; ++j) {
2246 if (array->groups[j]->print_shared_level != level)
2247 continue;
2249 print_group_private_accesses(gen, array->groups[j],
2250 type, shared_domain,
2251 first_shared, shared_len, sched);
2255 isl_union_map_free(sched);
2256 isl_set_free(shared_domain);
2259 /* Set unroll[j] if the input dimension j is involved in
2260 * the index expression represented by bmap.
2262 static int check_unroll(__isl_take isl_basic_map *bmap, void *user)
2264 int i, j;
2265 int n_in = isl_basic_map_dim(bmap, isl_dim_in);
2266 int n_out = isl_basic_map_dim(bmap, isl_dim_out);
2267 int *unroll = user;
2269 for (i = 0; i < n_out; ++i) {
2270 isl_constraint *c;
2271 int ok;
2273 ok = isl_basic_map_has_defining_equality(bmap,
2274 isl_dim_out, i, &c);
2275 assert(ok);
2276 for (j = 0; j < n_in; ++j)
2277 if (isl_constraint_involves_dims(c, isl_dim_in, j, 1))
2278 unroll[j] = 1;
2279 isl_constraint_free(c);
2282 isl_basic_map_free(bmap);
2283 return 0;
2286 /* Given an array pos mapping input dimensions to the corresponding
2287 * output dimension, construct the corresponding map.
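 * For example (illustrative only), len = 3 and pos = {1, 2, 0} produce
 * the map { [i0, i1, i2] -> [o0, o1, o2] : o1 = i0 and o2 = i1 and o0 = i2 },
 * i.e., [i0, i1, i2] is mapped to [i2, i0, i1].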
2289 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
2290 int *pos, int len)
2292 int i;
2293 isl_constraint *c;
2294 isl_basic_map *bmap;
2295 isl_local_space *ls;
2297 dim = isl_space_add_dims(dim, isl_dim_in, len);
2298 dim = isl_space_add_dims(dim, isl_dim_out, len);
2299 bmap = isl_basic_map_universe(isl_space_copy(dim));
2300 ls = isl_local_space_from_space(dim);
2302 for (i = 0; i < len; ++i) {
2303 c = isl_equality_alloc(isl_local_space_copy(ls));
2304 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
2305 isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i], 1);
2306 bmap = isl_basic_map_add_constraint(bmap, c);
2308 isl_local_space_free(ls);
2310 return isl_map_from_basic_map(bmap);
2313 /* Find all loops involved in any of the index expressions for any of
2314 * the private accesses, move them innermost and then mark them as
2315 * requiring unrolling by setting gen->first_unroll.
2316 * The loops involved should all be parallel because of the checks
2317 * we performed in check_private_group_access. Moving them innermost
2318 * is therefore a valid transformation.
2320 static __isl_give isl_union_map *interchange_for_unroll(struct cuda_gen *gen,
2321 __isl_take isl_union_map *sched)
2323 int i, j;
2324 int unroll[gen->thread_tiled_len];
2325 int perm[gen->thread_tiled_len];
2326 isl_space *dim;
2327 isl_map *permute;
2328 int len = gen->shared_len + gen->n_parallel + gen->n_block;
2330 gen->first_unroll = -1;
2332 for (i = 0; i < gen->thread_tiled_len; ++i)
2333 unroll[i] = 0;
2334 for (i = 0; i < gen->n_array; ++i) {
2335 struct cuda_array_info *array = &gen->array[i];
2337 for (j = 0; j < array->n_group; ++j) {
2338 isl_union_map *access;
2339 isl_map *acc;
2341 if (!array->groups[j]->private_bound)
2342 continue;
2344 access = group_access_relation(array->groups[j], 1, 1);
2345 access = isl_union_map_apply_domain(access,
2346 isl_union_map_copy(sched));
2348 acc = isl_map_from_union_map(access);
2349 isl_map_foreach_basic_map(acc, &check_unroll, unroll);
2351 isl_map_free(acc);
2355 for (i = 0; i < gen->shared_len; ++i)
2356 if (unroll[i])
2357 return sched;
2359 for (i = gen->shared_len; i < len; ++i)
2360 if (unroll[i])
2361 break;
2363 if (i >= len)
2364 return sched;
2366 for (i = len; i < gen->thread_tiled_len; ++i)
2367 if (unroll[i])
2368 return sched;
2370 j = 0;
2371 for (i = 0; i < gen->thread_tiled_len; ++i)
2372 if (!unroll[i])
2373 perm[i] = j++;
2374 gen->first_unroll = 1 + j;
2375 for (i = 0; i < len; ++i)
2376 if (unroll[i])
2377 perm[i] = j++;
2379 dim = isl_union_map_get_space(sched);
2380 permute = permutation(dim, perm, gen->thread_tiled_len);
2381 sched = isl_union_map_apply_range(sched,
2382 isl_union_map_from_map(permute));
2384 return sched;
2387 /* This function is called for each leaf in the clast of the kernel code.
2388 * We first specialize the schedule to the site of the leaf and
2389 * print code for reading into shared memory, performing the actual
2390 * computations and writing from shared memory, with the required
2391 * synchronizations.
2393 static void print_kernel_user(struct clast_printer_info *code,
2394 struct clast_user_stmt *u)
2396 struct cuda_gen *gen = code->user;
2397 isl_set *shared_domain;
2399 shared_domain = extract_entire_host_domain(&u->stmt);
2401 print_shared_accesses(gen, shared_domain, gen->read, "read", -1);
2403 print_private_accesses(gen, shared_domain, gen->read, "read", -1);
2405 print_shared_body(gen, shared_domain, gen->local_sched,
2406 gen->thread_tiled_len, &print_statement,
2407 gen->first_unroll);
2409 print_private_accesses(gen, shared_domain, gen->write, "write", -1);
2411 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
2412 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
2414 print_shared_accesses(gen, shared_domain, gen->write, "write", -1);
2416 isl_set_free(shared_domain);
2419 /* Check if we need to perform any copying to shared memory at this level
2420 * and if so, print the copying instructions.
2421 * The copying is printed for any array group for which we are allowed
2422 * to print copying instructions at this level but have not done so already.
2424 static void copy_to_local(struct cuda_gen *gen, __isl_keep isl_set *domain)
2426 int i, j;
2427 int level;
2428 int print = 0;
2430 level = isl_set_dim(domain, isl_dim_set);
2432 for (i = 0; i < gen->n_array; ++i) {
2433 struct cuda_array_info *array = &gen->array[i];
2435 for (j = 0; j < array->n_group; ++j) {
2436 if (array->groups[j]->print_shared_level >= 0)
2437 continue;
2438 if (array->groups[j]->last_shared >= level)
2439 continue;
2440 array->groups[j]->print_shared_level = level;
2441 print = 1;
2445 if (print) {
2446 print_shared_accesses(gen, domain, gen->read, "read", level);
2447 print_private_accesses(gen, domain, gen->read, "read", level);
2452 /* This function is called for each for loop in the clast,
2453 * right after the opening brace has been printed.
2455 * Print copying instructions to shared or private memory if needed.
2457 static void print_kernel_for_head(struct clast_printer_info *code,
2458 struct clast_for *f)
2460 struct cuda_gen *gen = code->user;
2461 isl_set *domain;
2463 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2464 copy_to_local(gen, domain);
2466 isl_set_free(domain);
2469 /* Print instructions for copying from shared memory for each array
2470 * for which print_kernel_for_head has added copying instructions
2471 * to shared memory.
2473 static void copy_from_local(struct cuda_gen *gen, __isl_keep isl_set *domain)
2475 int i, j;
2476 int level;
2477 int print = 0;
2479 level = isl_set_dim(domain, isl_dim_set);
2481 for (i = 0; i < gen->n_array; ++i) {
2482 struct cuda_array_info *array = &gen->array[i];
2484 for (j = 0; j < array->n_group; ++j) {
2485 if (array->groups[j]->print_shared_level != level)
2486 continue;
2487 print = 1;
2488 break;
2490 if (print)
2491 break;
2494 if (print) {
2495 print_private_accesses(gen, domain, gen->write, "write", level);
2496 print_shared_accesses(gen, domain, gen->write, "write", level);
2500 /* This function is called for each for loop in the clast,
2501 * right before the closing brace is printed.
2503 * Print copying instructions from shared or private memory if needed.
2505 static void print_kernel_for_foot(struct clast_printer_info *code,
2506 struct clast_for *f)
2508 struct cuda_gen *gen = code->user;
2509 isl_set *domain;
2511 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2512 copy_from_local(gen, domain);
2514 isl_set_free(domain);
2517 /* Use CLooG to generate code for the outer gen->shared_len loops
2518 * of the local schedule "sched".
2519 * The pretty printing of this code is handled by print_clast,
2520 * which calls print_kernel_user for each iteration of the shared tile loops.
2522 static void print_cloog_kernel_body(struct cuda_gen *gen,
2523 __isl_keep isl_set *context, __isl_keep isl_union_map *sched)
2525 int i;
2526 CloogOptions *options;
2527 CloogDomain *cloog_context;
2528 CloogUnionDomain *ud;
2529 CloogInput *input;
2530 struct clast_stmt *stmt;
2531 char name[20];
2533 sched = isl_union_map_copy(sched);
2534 sched = isl_union_map_align_params(sched, isl_set_get_space(context));
2536 options = cloog_options_malloc(gen->state);
2537 options->language = CLOOG_LANGUAGE_C;
2538 options->strides = 1;
2539 options->sh = 1;
2540 options->stop = gen->shared_len;
2541 options->f = gen->tiled_len;
2542 options->l = gen->tiled_len;
2543 options->save_domains = 1;
2544 options->noscalars = 1;
2546 ud = cloog_union_domain_from_isl_union_map(sched);
2547 for (i = 0; i < gen->shared_len; ++i) {
2548 snprintf(name, sizeof(name), "g%d", i);
2549 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
2551 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
2552 input = cloog_input_alloc(cloog_context, ud);
2554 stmt = cloog_clast_create_from_input(input, options);
2556 gen->kernel_code.indent = 4;
2557 gen->kernel_code.dst = gen->cuda.kernel_c;
2558 gen->kernel_code.print_user_stmt = NULL;
2559 gen->kernel_code.print_user_stmt_list = &print_kernel_user;
2560 gen->kernel_code.print_for_head = &print_kernel_for_head;
2561 gen->kernel_code.print_for_foot = &print_kernel_for_foot;
2562 gen->kernel_code.user = gen;
2563 copy_to_local(gen, context);
2564 print_clast(&gen->kernel_code, stmt);
2565 copy_from_local(gen, context);
2567 cloog_clast_free(stmt);
2568 cloog_options_free(options);
2571 static void print_kernel_iterators(struct cuda_gen *gen)
2573 int i;
2574 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
2575 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
2576 "threadIdx.z" };
2578 if (gen->n_grid > 0) {
2579 print_indent(gen->cuda.kernel_c, 4);
2580 fprintf(gen->cuda.kernel_c, "int ");
2581 for (i = 0; i < gen->n_grid; ++i) {
2582 if (i)
2583 fprintf(gen->cuda.kernel_c, ", ");
2584 fprintf(gen->cuda.kernel_c, "b%d = %s",
2585 i, block_dims[gen->n_grid - 1 - i]);
2587 fprintf(gen->cuda.kernel_c, ";\n");
2590 if (gen->n_block > 0) {
2591 print_indent(gen->cuda.kernel_c, 4);
2592 fprintf(gen->cuda.kernel_c, "int ");
2593 for (i = 0; i < gen->n_block; ++i) {
2594 if (i)
2595 fprintf(gen->cuda.kernel_c, ", ");
2596 fprintf(gen->cuda.kernel_c, "t%d = %s",
2597 i, thread_dims[gen->n_block - 1 - i]);
2599 fprintf(gen->cuda.kernel_c, ";\n");
2603 static void print_group_shared_array(struct cuda_gen *gen,
2604 struct cuda_array_ref_group *group)
2606 int j;
2607 struct cuda_array_bound *bounds;
2609 bounds = group->private_bound;
2610 if (!bounds)
2611 bounds = group->shared_bound;
2612 if (!bounds)
2613 return;
2615 print_indent(gen->cuda.kernel_c, 4);
2616 fprintf(gen->cuda.kernel_c, "%s%s ",
2617 group->private_bound ? "" : "__shared__ ", group->array->type);
2618 print_array_name(gen->cuda.kernel_c, group);
2619 for (j = 0; j < group->array->n_index; ++j) {
2620 fprintf(gen->cuda.kernel_c, "[");
2621 isl_int_print(gen->cuda.kernel_c, bounds[j].size, 0);
2622 fprintf(gen->cuda.kernel_c, "]");
2624 fprintf(gen->cuda.kernel_c, ";\n");
2627 static void print_shared_arrays(struct cuda_gen *gen)
2629 int i, j;
2631 for (i = 0; i < gen->n_array; ++i) {
2632 struct cuda_array_info *array = &gen->array[i];
2634 for (j = 0; j < array->n_group; ++j)
2635 print_group_shared_array(gen, array->groups[j]);
2639 static void print_kernel_body(struct cuda_gen *gen,
2640 __isl_keep isl_set *host_domain, __isl_keep isl_union_map *sched)
2642 isl_set *context;
2644 context = isl_set_copy(host_domain);
2645 context = parametrize(context, 0, gen->tile_first, "h");
2646 context = isl_set_project_out(context, isl_dim_set, 0, gen->tile_first);
2647 context = add_bounded_parameters(context,
2648 gen->n_grid, gen->grid_dim, "b");
2650 print_kernel_iterators(gen);
2651 print_shared_arrays(gen);
2653 fprintf(gen->cuda.kernel_c, "\n");
2655 print_cloog_kernel_body(gen, context, sched);
2657 isl_set_free(context);
2660 /* Given a constraint
2662 * a(p,i) + j = g f(e)
2664 * or -a(p,i) - j = g f(e) if sign < 0,
2665 * store a(p,i) in bound->shift and g (stride) in bound->stride.
2666 * a(p,i) is assumed to be an expression in only the parameters.
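 * A hypothetical example: the constraint n + j = 4 alpha (with sign > 0)
 * results in bound->shift = n and bound->stride = 4, expressing that
 * j + n is always a multiple of 4.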
2668 static void extract_stride(__isl_keep isl_constraint *c,
2669 struct cuda_array_bound *bound, isl_int stride, int sign)
2671 int i;
2672 isl_int v;
2673 isl_space *dim;
2674 unsigned nparam;
2675 isl_aff *aff;
2677 isl_int_set(bound->stride, stride);
2679 dim = isl_constraint_get_space(c);
2680 dim = isl_space_params(dim);
2682 nparam = isl_space_dim(dim, isl_dim_param);
2684 isl_int_init(v);
2686 isl_constraint_get_constant(c, &v);
2687 if (sign < 0)
2688 isl_int_neg(v, v);
2689 aff = isl_aff_zero_on_domain(isl_local_space_from_space(dim));
2690 aff = isl_aff_set_constant(aff, v);
2692 for (i = 0; i < nparam; ++i) {
2693 isl_constraint_get_coefficient(c, isl_dim_param, i, &v);
2694 if (isl_int_is_zero(v))
2695 continue;
2696 if (sign < 0)
2697 isl_int_neg(v, v);
2698 aff = isl_aff_add_coefficient(aff, isl_dim_param, i, v);
2701 isl_int_clear(v);
2703 bound->shift = aff;
2706 /* Given an equality constraint of a map with a single output dimension j,
2707 * check if the constraint is of the form
2709 * a(p,i) + j = g f(e)
2711 * with a(p,i) an expression in the parameters and input dimensions
2712 * and f(e) an expression in the existentially quantified variables.
2713 * If so, and if g is larger than any such g from a previously considered
2714 * constraint, then call extract_stride to record the stride information
2715 * in bound.
2717 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
2719 int i;
2720 isl_int v, stride;
2721 unsigned n_div;
2722 struct cuda_array_bound *bound = user;
2724 isl_int_init(v);
2725 isl_int_init(stride);
2727 n_div = isl_constraint_dim(c, isl_dim_div);
2728 isl_constraint_get_coefficient(c, isl_dim_out, 0, &v);
2730 if (n_div && (isl_int_is_one(v) || isl_int_is_negone(v))) {
2731 int s = isl_int_sgn(v);
2732 isl_int_set_si(stride, 0);
2733 for (i = 0; i < n_div; ++i) {
2734 isl_constraint_get_coefficient(c, isl_dim_div, i, &v);
2735 isl_int_gcd(stride, stride, v);
2737 if (!isl_int_is_zero(stride) &&
2738 isl_int_gt(stride, bound->stride))
2739 extract_stride(c, bound, stride, s);
2742 isl_int_clear(stride);
2743 isl_int_clear(v);
2745 isl_constraint_free(c);
2746 return 0;
2749 /* Given constraints on an array index i, check if we can find
2750 * a shift a(p) and a stride g such that
2752 * a(p) + i = 0 mod g
2754 * If so, record the information in bound and apply the mapping
2755 * i -> (i + a(p))/g to the array index in bounds and return
2756 * the new constraints.
2757 * If not, simply return the original constraints.
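 * As an illustration only: if the accessed indices are of the form
 * i = n + 2 alpha, then a(p) = -n and g = 2, and applying the mapping
 * i -> (i - n)/2 turns the accessed indices into the consecutive
 * values 0, 1, 2, ...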
2759 static __isl_give isl_basic_map *check_stride(struct cuda_gen *gen,
2760 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2762 isl_basic_map *aff;
2763 isl_basic_map *shift;
2764 isl_aff *aff_shift;
2766 isl_int_set_si(bound->stride, -1);
2768 aff = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
2770 isl_basic_map_foreach_constraint(aff, &check_stride_constraint, bound);
2772 isl_basic_map_free(aff);
2774 if (isl_int_is_neg(bound->stride))
2775 return bounds;
2777 aff_shift = isl_aff_copy(bound->shift);
2778 aff_shift = isl_aff_add_dims(aff_shift, isl_dim_in, 1);
2779 aff_shift = isl_aff_add_coefficient_si(aff_shift, isl_dim_in, 0, 1);
2780 aff_shift = isl_aff_scale_down(aff_shift, bound->stride);
2781 shift = isl_basic_map_from_aff(aff_shift);
2783 bound->shift_map = isl_basic_map_copy(shift);
2784 bounds = isl_basic_map_apply_range(bounds, shift);
2786 return bounds;
2789 struct cuda_size_info {
2790 isl_basic_set *bset;
2791 struct cuda_array_bound *bound;
2792 int pos;
2795 /* Given a constraint from the basic set describing the bounds on
2796 * an array index, check if it is a lower bound, say m i >= b(x), and,
2797 * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
2798 * upper bound. If so, and if this bound is smaller than any bound
2799 * derived from earlier constraints, set the size to this bound on
2800 * the expression and the lower bound to ceil(b(x)/m).
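 * For example (purely illustrative), if the constraints include
 * i >= n and i <= n + 31, then the lower bound i >= n gives
 * ceil(b(x)/m) = n, the expression i - n + 1 has constant upper
 * bound 32, and we record a size of 32 with lower bound n.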
2802 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
2804 struct cuda_size_info *size = user;
2805 unsigned nparam;
2806 unsigned n_div;
2807 isl_int v;
2809 nparam = isl_basic_set_dim(size->bset, isl_dim_param);
2810 n_div = isl_constraint_dim(c, isl_dim_div);
2812 if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div)) {
2813 isl_constraint_free(c);
2814 return 0;
2817 isl_int_init(v);
2819 isl_constraint_get_coefficient(c, isl_dim_set, size->pos, &v);
2821 if (isl_int_is_pos(v)) {
2822 isl_aff *aff;
2823 isl_aff *lb;
2824 enum isl_lp_result res;
2826 aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
2827 aff = isl_aff_ceil(aff);
2829 lb = isl_aff_copy(aff);
2831 aff = isl_aff_neg(aff);
2832 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
2834 res = isl_basic_set_max(size->bset, aff, &v);
2835 isl_aff_free(aff);
2837 if (res == isl_lp_ok) {
2838 isl_int_add_ui(v, v, 1);
2839 if (isl_int_is_neg(size->bound->size) ||
2840 isl_int_lt(v, size->bound->size)) {
2841 isl_int_set(size->bound->size, v);
2842 lb = isl_aff_drop_dims(lb, isl_dim_in,
2843 0, size->pos + 1);
2844 isl_aff_free(size->bound->lb);
2845 size->bound->lb = isl_aff_copy(lb);
2848 isl_aff_free(lb);
2851 isl_int_clear(v);
2852 isl_constraint_free(c);
2854 return 0;
2857 /* Given a basic map "bounds" that maps parameters and input dimensions
2858 * to a single output dimension, look for an expression in the parameters
2859 * and input dimensions such that the range of the output dimension shifted
2860 * by this expression is a constant.
2862 * In particular, we currently only consider lower bounds on the output
2863 * dimension as candidate expressions.
2865 static int compute_array_dim_size(struct cuda_gen *gen,
2866 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2868 struct cuda_size_info size;
2870 bounds = isl_basic_map_detect_equalities(bounds);
2871 bounds = check_stride(gen, bound, bounds);
2873 isl_int_set_si(bound->size, -1);
2874 bound->lb = NULL;
2876 size.bound = bound;
2877 size.pos = isl_basic_map_dim(bounds, isl_dim_in);
2878 size.bset = isl_basic_map_wrap(bounds);
2879 size.bset = isl_basic_set_flatten(size.bset);
2880 size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
2881 isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
2882 &size);
2883 isl_basic_set_free(size.bset);
2885 return isl_int_is_nonneg(bound->size) ? 0 : -1;
2888 /* Check if we can find a shared memory tile for the given array
2889 * based on the given accesses, and if so, put the results
2890 * in "bounds".
2892 * We project the accesses on each index in turn and look for a parametric
2893 * offset such that the size is constant.
2895 static int can_tile_for_shared_memory(struct cuda_gen *gen,
2896 struct cuda_array_info *array, __isl_keep isl_map *access,
2897 struct cuda_array_bound *bounds)
2899 int i;
2901 for (i = 0; i < array->n_index; ++i) {
2902 isl_map *access_i;
2903 isl_basic_map *hull;
2905 access_i = isl_map_copy(access);
2906 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
2907 access_i = isl_map_project_out(access_i, isl_dim_out,
2908 1, array->n_index - (i + 1));
2909 access_i = isl_map_compute_divs(access_i);
2910 hull = isl_map_simple_hull(access_i);
2911 if (compute_array_dim_size(gen, &bounds[i], hull) < 0)
2912 return 0;
2915 return 1;
2918 /* Construct a map whose input are the shared tile loops and the loops that
2919 * will be wrapped around the threads, which relates these latter loops
2920 * to the thread indices and then projects them out.
2922 static __isl_give isl_map *compute_privatization(struct cuda_gen *gen)
2924 isl_map *priv;
2925 isl_map *tiling;
2926 isl_map *proj;
2927 isl_set *par;
2928 isl_space *dim;
2930 dim = isl_union_map_get_space(gen->shared_sched);
2932 if (gen->options->wrap)
2933 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
2934 gen->shared_len, gen->n_block, gen->block_dim);
2935 else
2936 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
2937 gen->shared_len, gen->n_block, gen->block_dim);
2939 priv = tiling;
2941 par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2942 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2943 gen->n_block, "t");
2945 priv = isl_map_align_params(priv, isl_set_get_space(par));
2946 priv = isl_map_intersect_range(priv, par);
2948 dim = isl_map_get_space(priv);
2949 dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2950 dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2951 proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2952 gen->shared_len);
2954 priv = isl_map_apply_range(priv, proj);
2956 return priv;
2959 /* Construct a map from domain_dim to domain_dim that increments
2960 * the dimension at position "pos" and leaves all other dimensions
2961 * constant.
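 * For example, for a two-dimensional domain and pos = 1, the result
 * is the map { [i, j] -> [i, j + 1] } (illustrative only).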
2963 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2965 int i;
2966 int len = isl_space_dim(domain_dim, isl_dim_set);
2967 isl_space *dim;
2968 isl_basic_map *next;
2969 isl_local_space *ls;
2971 dim = isl_space_map_from_set(domain_dim);
2972 next = isl_basic_map_universe(isl_space_copy(dim));
2973 ls = isl_local_space_from_space(dim);
2975 for (i = 0; i < len; ++i) {
2976 isl_constraint *c;
2978 c = isl_equality_alloc(isl_local_space_copy(ls));
2979 isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2980 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2981 if (i == pos)
2982 isl_constraint_set_constant_si(c, 1);
2983 next = isl_basic_map_add_constraint(next, c);
2986 isl_local_space_free(ls);
2988 return isl_map_from_basic_map(next);
2991 /* Check if the given access is coalesced.
2992 * That is, check whether incrementing the dimension that will get
2993 * wrapped over the last thread index results in incrementing
2994 * the last array index.
2996 * This function is only called for access relations without reuse.
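 * A hypothetical illustration: if the iterator wrapped over threadIdx.x
 * appears as the innermost array index, as in A[i][j] with j wrapped
 * over the threads, then incrementing that iterator increments the last
 * array index, so the access is coalesced; an access A[j][i] under the
 * same mapping would not be.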
2998 static int access_is_coalesced(struct cuda_gen *gen,
2999 __isl_keep isl_union_map *access)
3001 isl_space *dim;
3002 isl_map *access_map;
3003 isl_map *next_thread_x;
3004 isl_map *next_element;
3005 isl_map *map;
3006 int coalesced;
3008 access = isl_union_map_copy(access);
3009 access = isl_union_map_apply_domain(access,
3010 isl_union_map_copy(gen->tiled_sched));
3011 access_map = isl_map_from_union_map(access);
3013 dim = isl_map_get_space(access_map);
3014 dim = isl_space_domain(dim);
3015 next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
3017 dim = isl_map_get_space(access_map);
3018 dim = isl_space_range(dim);
3019 next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
3021 map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
3022 map = isl_map_apply_range(map, access_map);
3024 coalesced = isl_map_is_subset(map, next_element);
3026 isl_map_free(next_element);
3027 isl_map_free(map);
3029 return coalesced;
3032 /* For the given array reference group, check whether the access is private
3033 * to the thread. That is, check that any given array element
3034 * is only accessed by a single thread.
3035 * We compute an access relation that maps the shared tile loop iterators
3036 * and the shared point loop iterators that will be wrapped over the
3037 * threads to the array elements.
3038 * We actually check that those iterators that will be wrapped
3039 * partition the array space. This check is stricter than necessary
3040 * since several iterations may be mapped onto the same thread
3041 * and then they could be allowed to access the same memory elements,
3042 * but our check does not allow this situation.
3044 * We also check that the index expression only depends on parallel
3045 * loops. That way, we can move those loops innermost and unroll them.
3046 * Again, we use a test that is stricter than necessary.
3047 * We actually check whether the index expression only depends
3048 * on the iterators that are wrapped over the threads.
3049 * These are necessarily parallel, but there may be more parallel loops.
3051 * Combining the injectivity of the first test with the single-valuedness
3052 * of the second test, we simply test for bijectivity.
3054 * If it turns out we can use registers, we compute the private memory
3055 * tile size using can_tile_for_shared_memory, after introducing a dependence
3056 * on the thread indices.
3058 * Before performing any of the above computations, we first check
3059 * if there is any reuse on the reference group. If not, we simply
3060 * return. If, moreover, the access is coalesced then we also remove
3061 * the shared memory tiling since we should just use global memory instead.
3063 static void check_private_group_access(struct cuda_gen *gen,
3064 struct cuda_array_ref_group *group)
3066 isl_map *acc;
3067 isl_union_map *access;
3068 int n_index = group->array->n_index;
3070 access = group_access_relation(group, 1, 1);
3071 if (isl_union_map_is_injective(access)) {
3072 if (group->shared_bound && access_is_coalesced(gen, access)) {
3073 free_bound_list(group->shared_bound, n_index);
3074 group->shared_bound = NULL;
3076 isl_union_map_free(access);
3077 return;
3079 access = isl_union_map_apply_domain(access,
3080 isl_union_map_copy(gen->shared_sched));
3082 acc = isl_map_from_union_map(access);
3084 if (!isl_map_is_bijective(acc)) {
3085 isl_map_free(acc);
3086 return;
3089 group->private_bound = create_bound_list(gen->ctx, n_index);
3090 acc = isl_map_align_params(acc, isl_map_get_space(gen->privatization));
3091 acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
3092 if (!can_tile_for_shared_memory(gen, group->array, acc,
3093 group->private_bound)) {
3094 free_bound_list(group->private_bound, n_index);
3095 group->private_bound = NULL;
3098 isl_map_free(acc);
3101 /* Look for the last shared tile loop that affects the offset of the
3102 * shared or private tile and store the result in group->last_shared.
3104 static void set_last_shared(struct cuda_gen *gen,
3105 struct cuda_array_ref_group *group)
3107 int i, j;
3108 struct cuda_array_bound *bounds;
3109 unsigned first_shared = gen->first_shared;
3110 int n_index = group->array->n_index;
3112 bounds = group->private_bound;
3113 if (!bounds)
3114 bounds = group->shared_bound;
3115 if (!bounds)
3116 return;
3118 for (j = gen->shared_len - 1; j >= 0; --j) {
3119 for (i = 0; i < n_index; ++i) {
3120 isl_aff *lb;
3121 isl_aff *shift;
3123 lb = bounds[i].lb;
3124 if (isl_aff_involves_dims(lb, isl_dim_param,
3125 first_shared + j, 1))
3126 break;
3128 shift = bounds[i].shift;
3129 if (!shift)
3130 continue;
3131 if (isl_aff_involves_dims(shift, isl_dim_param,
3132 first_shared + j, 1))
3133 break;
3135 if (i < n_index)
3136 break;
3138 group->last_shared = j;
3141 /* Compute the sizes of all private arrays for the current kernel,
3142 * as well as the offsets of the private pieces in the original arrays.
3143 * If we cannot or don't want to privatize a given array group,
3144 * we use the shared memory tile sizes computed in
3145 * compute_group_shared_bound instead.
3147 * If we have been able to find a private or shared tile,
3148 * we also look for the last shared tile loop that affects the offset
3149 * (and therefore the group tile) and store the result in group->last_shared.
3151 * A privatized copy of all access relations from reference groups that
3152 * are mapped to private memory is stored in gen->private_access.
3154 static void compute_private_size(struct cuda_gen *gen)
3156 int i, j;
3157 isl_union_map *private;
3159 if (!gen->options->use_private_memory)
3160 return;
3162 private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
3164 for (i = 0; i < gen->n_array; ++i) {
3165 struct cuda_array_info *array = &gen->array[i];
3167 for (j = 0; j < array->n_group; ++j) {
3168 check_private_group_access(gen, array->groups[j]);
3170 if (!array->groups[j]->private_bound)
3171 continue;
3173 private = isl_union_map_union(private,
3174 group_access_relation(array->groups[j], 1, 1));
3177 for (j = 0; j < array->n_group; ++j) {
3178 array->groups[j]->last_shared = gen->shared_len - 1;
3179 array->groups[j]->print_shared_level = -1;
3180 set_last_shared(gen, array->groups[j]);
3184 if (isl_union_map_is_empty(private))
3185 isl_union_map_free(private);
3186 else {
3187 isl_union_map *priv;
3189 private = isl_union_map_apply_domain(private,
3190 isl_union_map_copy(gen->shared_sched));
3191 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
3192 private = isl_union_map_apply_domain(private, priv);
3193 gen->private_access = private;
3197 /* Compute the size of the tile specified by the list "bound" of n_index
3198 * cuda_array_bounds in number of elements and put the result in *size.
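 * For example, a two-dimensional tile with bounds of size 4 and 32
 * contains 4 * 32 = 128 elements.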
3200 static void tile_size(unsigned n_index, struct cuda_array_bound *bound,
3201 isl_int *size)
3203 int i;
3205 isl_int_set_si(*size, 1);
3207 for (i = 0; i < n_index; ++i)
3208 isl_int_mul(*size, *size, bound[i].size);
3211 /* If max_shared_memory is not set to infinity (-1), then make
3212 * sure that the total amount of shared memory required by the
3213 * array reference groups mapped to shared memory is no larger
3214 * than this maximum.
3216 * We apply a greedy approach and discard (keep in global memory)
3217 * those groups that would result in a total memory size that
3218 * is larger than the maximum.
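 * As an illustration only: with max_shared_memory = 49152 bytes,
 * a 32 x 32 tile of 4-byte elements needs 4096 bytes; groups are
 * accepted in order as long as they fit in the remaining budget,
 * while any group that no longer fits keeps its data in global memory.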
3220 static void check_shared_memory_bound(struct cuda_gen *gen)
3222 int i, j;
3223 isl_int left, size;
3225 if (gen->options->max_shared_memory < 0)
3226 return;
3228 isl_int_init(left);
3229 isl_int_init(size);
3230 isl_int_set_si(left, gen->options->max_shared_memory);
3232 for (i = 0; i < gen->n_array; ++i) {
3233 struct cuda_array_info *array = &gen->array[i];
3235 for (j = 0; j < array->n_group; ++j) {
3236 struct cuda_array_ref_group *group;
3238 group = array->groups[j];
3239 if (!group->shared_bound)
3240 continue;
3242 tile_size(array->n_index, group->shared_bound, &size);
3243 isl_int_mul_ui(size, size, array->size);
3245 if (isl_int_le(size, left)) {
3246 isl_int_sub(left, left, size);
3247 continue;
3250 free_bound_list(group->shared_bound, array->n_index);
3251 group->shared_bound = NULL;
3255 isl_int_clear(size);
3256 isl_int_clear(left);
3259 /* Fill up the groups array with singleton groups, i.e., one group
3260 * per reference, initializing the array, access, write and refs fields.
3261 * In particular the access field is initialized to the scheduled
3262 * access relation of the array reference.
3264 * Return the number of elements initialized, i.e., the number of
3265 * active references in the current kernel.
3267 static int populate_array_references(struct cuda_gen *gen,
3268 struct cuda_array_info *array, __isl_keep isl_union_map *sched,
3269 struct cuda_array_ref_group **groups)
3271 int i;
3272 int n;
3273 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3275 n = 0;
3276 for (i = 0; i < array->n_ref; ++i) {
3277 isl_union_map *umap;
3278 isl_map *map;
3279 struct cuda_array_ref_group *group;
3280 struct cuda_stmt_access *access = array->refs[i];
3282 map = isl_map_copy(access->access);
3283 umap = isl_union_map_from_map(map);
3284 umap = isl_union_map_apply_domain(umap,
3285 isl_union_map_copy(sched));
3287 if (isl_union_map_is_empty(umap)) {
3288 isl_union_map_free(umap);
3289 continue;
3292 map = isl_map_from_union_map(umap);
3293 map = isl_map_detect_equalities(map);
3295 group = isl_calloc_type(ctx, struct cuda_array_ref_group);
3296 assert(group);
3297 group->array = array;
3298 group->access = map;
3299 group->write = access->write;
3300 group->refs = &array->refs[i];
3302 groups[n++] = group;
3305 return n;
3308 static void free_array_ref_group(struct cuda_array_ref_group *group,
3309 int n_index)
3311 if (!group)
3312 return;
3313 free_bound_list(group->shared_bound, n_index);
3314 free_bound_list(group->private_bound, n_index);
3315 isl_map_free(group->access);
3316 free(group->refs);
3317 free(group);
3320 /* Given a set where the parameters gen->first_shared up to
3321 * gen->first_shared + gen->shared_len represent the tile loops,
3322 * eliminate the innermost of those that have a fixed value
3323 * until we reach one that does not (obviously) have a fixed value.
3325 static __isl_give isl_set *eliminate_fixed_inner_loops(struct cuda_gen *gen,
3326 __isl_take isl_set *access)
3328 int i;
3330 for (i = gen->shared_len - 1; i >= 0; --i) {
3331 int pos = gen->first_shared + i;
3332 if (!isl_set_plain_is_fixed(access, isl_dim_param, pos, NULL))
3333 break;
3334 access = isl_set_eliminate(access, isl_dim_param, pos, 1);
3336 return access;
3339 /* Check if the accessed set of group1 and group2 overlap within
3340 * the innermost loop. In particular, ignore any inner dimension
3341 * with a fixed value.
3342 * The copying to and from shared memory will be performed within
3343 * the innermost actual loop so we are only allowed to consider
3344 * the dimensions up to that innermost loop while checking whether
3345 * two access sets overlap.
3347 static int accesses_overlap(struct cuda_gen *gen,
3348 struct cuda_array_ref_group *group1,
3349 struct cuda_array_ref_group *group2)
3351 int empty;
3352 isl_set *access1, *access2;
3354 access1 = isl_map_range(isl_map_copy(group1->access));
3355 access1 = eliminate_fixed_inner_loops(gen, access1);
3356 access2 = isl_map_range(isl_map_copy(group2->access));
3357 access2 = eliminate_fixed_inner_loops(gen, access2);
3358 access1 = isl_set_intersect(access1, access2);
3359 empty = isl_set_is_empty(access1);
3360 isl_set_free(access1);
3362 return !empty;
3365 /* If two groups have overlapping access relations (within the innermost
3366 * loop) and if one of them involves a write, then merge the two groups
3367 * into one.
3369 * We keep track of the grouping in "leader". leader[j] points to
3370 * an earlier group array element that belongs to the same group,
3371 * or the array element j itself if this element is the first in the group.
3373 * Return the number of group leaders.
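 * For example (purely illustrative), if reference 2 is a write that
 * overlaps reference 0 while reference 1 overlaps neither, then the
 * leader array becomes {0, 1, 0}: references 0 and 2 form a single
 * group led by 0, reference 1 stays on its own and 2 group leaders
 * are returned.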
3375 static int group_overlapping_writes(struct cuda_gen *gen, int n,
3376 struct cuda_array_ref_group **groups, int *leader)
3378 int i, j;
3379 int n_group = n;
3381 for (i = 0; i < n; ++i) {
3382 int l = i;
3383 groups[l]->n_ref = 1;
3384 for (j = i - 1; j >= 0; --j) {
3385 if (leader[j] != j)
3386 continue;
3387 if (!groups[l]->write && !groups[j]->write)
3388 continue;
3390 if (!accesses_overlap(gen, groups[l], groups[j]))
3391 continue;
3393 groups[j]->access = isl_map_union(groups[j]->access,
3394 groups[l]->access);
3395 groups[j]->write = 1;
3396 groups[l]->access = NULL;
3397 groups[j]->n_ref += groups[l]->n_ref;
3398 l = leader[l] = j;
3399 n_group--;
3401 leader[i] = l;
3404 return n_group;
3407 /* Compute the size of the shared array corresponding to the given array
3408 * reference group, based on the accesses from the current kernel,
3409 * as well as the offset of the shared piece in the original array.
3411 static void compute_group_shared_bound(struct cuda_gen *gen,
3412 struct cuda_array_info *array, struct cuda_array_ref_group *group)
3414 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3416 if (!gen->options->use_shared_memory)
3417 return;
3418 if (cuda_array_is_read_only_scalar(array))
3419 return;
3421 group->shared_bound = create_bound_list(ctx, array->n_index);
3422 if (!can_tile_for_shared_memory(gen, array, group->access,
3423 group->shared_bound)) {
3424 free_bound_list(group->shared_bound, array->n_index);
3425 group->shared_bound = NULL;
3429 /* Is the size of the tile specified by "bound" smaller than the sum of
3430 * the sizes of the tiles specified by "bound1" and "bound2"?
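 * For example, given two 16 x 32 tiles (512 elements each, 1024 in
 * total), a combined 32 x 32 tile of exactly 1024 elements is not
 * smaller, while a combined 16 x 32 tile of 512 elements is.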
3432 static int smaller_tile(unsigned n_index, struct cuda_array_bound *bound,
3433 struct cuda_array_bound *bound1, struct cuda_array_bound *bound2)
3435 int smaller;
3436 isl_int size, size1, size2;
3438 isl_int_init(size);
3439 isl_int_init(size1);
3440 isl_int_init(size2);
3442 tile_size(n_index, bound, &size);
3443 tile_size(n_index, bound1, &size1);
3444 tile_size(n_index, bound2, &size2);
3446 isl_int_sub(size, size, size1);
3447 isl_int_sub(size, size, size2);
3448 smaller = isl_int_is_neg(size);
3450 isl_int_clear(size2);
3451 isl_int_clear(size1);
3452 isl_int_clear(size);
3454 return smaller;
3457 /* Given an initial grouping of array references and shared memory tiles
3458 * for each group that allows for a shared memory tile, merge two groups
3459 * if both have a shared memory tile, the merged group also has
3460 * a shared memory tile and the size of the tile for the merged group
3461 * is smaller than the sum of the tile sizes of the individual groups.
3463 * Return the number of group leaders after merging.
3465 static int group_common_shared_memory_tile(struct cuda_gen *gen,
3466 struct cuda_array_info *array, int n,
3467 struct cuda_array_ref_group **groups, int *leader, int n_group)
3469 int i, j;
3470 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3472 for (i = 0; n_group > 1 && i < n; ++i) {
3473 int l = i;
3474 if (leader[i] != i)
3475 continue;
3476 if (!groups[i]->shared_bound)
3477 continue;
3478 for (j = i - 1; j >= 0; --j) {
3479 isl_map *map;
3480 int empty;
3481 struct cuda_array_bound *shared_bound;
3483 if (leader[j] != j)
3484 continue;
3485 if (!groups[j]->shared_bound)
3486 continue;
3488 map = isl_map_intersect(isl_map_copy(groups[l]->access),
3489 isl_map_copy(groups[j]->access));
3490 empty = isl_map_is_empty(map);
3491 isl_map_free(map);
3493 if (empty)
3494 continue;
3496 map = isl_map_union(isl_map_copy(groups[l]->access),
3497 isl_map_copy(groups[j]->access));
3498 shared_bound = create_bound_list(ctx, array->n_index);
3499 if (!can_tile_for_shared_memory(gen, array, map,
3500 shared_bound) ||
3501 !smaller_tile(array->n_index, shared_bound,
3502 groups[l]->shared_bound,
3503 groups[j]->shared_bound)) {
3504 isl_map_free(map);
3505 free_bound_list(shared_bound, array->n_index);
3506 continue;
3509 free_bound_list(groups[j]->shared_bound,
3510 array->n_index);
3511 groups[j]->shared_bound = shared_bound;
3512 isl_map_free(groups[j]->access);
3513 groups[j]->access = map;
3514 groups[j]->n_ref += groups[l]->n_ref;
3515 l = leader[l] = j;
3516 n_group--;
3520 return n_group;
3523 /* Extract an array of array reference groups from the array of references
3524 * and the grouping information in "leader".
3526 * Store the results in array->n_group and array->groups.
3528 static void extract_array_groups(isl_ctx *ctx, struct cuda_array_info *array,
3529 int n, struct cuda_array_ref_group **groups, int *leader, int n_group)
3531 int i, j;
3533 for (i = 2; i < n; ++i)
3534 leader[i] = leader[leader[i]];
3536 array->n_group = n_group;
3537 array->groups = isl_alloc_array(ctx, struct cuda_array_ref_group *,
3538 n_group);
3539 assert(array->groups);
3541 j = 0;
3542 for (i = 0; i < n; ++i) {
3543 int k, l;
3544 struct cuda_stmt_access **refs;
3546 if (leader[i] != i) {
3547 groups[i]->refs = NULL;
3548 free_array_ref_group(groups[i], array->n_index);
3549 continue;
3552 refs = isl_alloc_array(ctx, struct cuda_stmt_access *,
3553 groups[i]->n_ref);
3554 assert(refs);
3555 l = 0;
3556 for (k = i; k < n; ++k)
3557 if (leader[k] == i) {
3558 refs[l++] = *groups[k]->refs;
3559 (*groups[k]->refs)->group = j;
3562 groups[i]->refs = refs;
3563 groups[i]->nr = j;
3564 array->groups[j++] = groups[i];
3568 /* Group array references that should be considered together when
3569 * deciding whether to access them from private, shared or global memory.
3571 * In particular, if two array references overlap and if one of them
3572 * is a write, then the two references are grouped together.
3573 * Furthermore, if two groups admit a shared memory tile and if the
3574 * combination of the two also admits a shared memory tile, we merge
3575 * the two groups.
3577 * During the construction the group->refs field points to a single
3578 * array reference inside the array of array references, while
3579 * group->n_ref contains the number of elements in leader that
3580 * (directly or indirectly) point to this group, provided the group
3581 * is a leader.
3583 static void group_array_references(struct cuda_gen *gen,
3584 struct cuda_array_info *array, __isl_keep isl_union_map *sched)
3586 int i;
3587 int n, n_group;
3588 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3589 struct cuda_array_ref_group **groups;
3590 int *leader;
3592 groups = isl_calloc_array(ctx, struct cuda_array_ref_group *,
3593 array->n_ref);
3594 assert(groups);
3596 n = populate_array_references(gen, array, sched, groups);
3598 leader = isl_alloc_array(ctx, int, n);
3599 assert(leader);
3601 n_group = group_overlapping_writes(gen, n, groups, leader);
3603 for (i = 0; i < n; ++i)
3604 if (leader[i] == i)
3605 compute_group_shared_bound(gen, array, groups[i]);
3607 n_group = group_common_shared_memory_tile(gen, array, n, groups,
3608 leader, n_group);
3610 extract_array_groups(ctx, array, n, groups, leader, n_group);
3612 free(leader);
3613 free(groups);
3616 /* Take tiled_sched, project it onto the shared tile loops and
3617 * the loops that will be wrapped over the threads,
3618 * parametrize the shared tile loops and store the result in gen->shared_sched.
3619 * The position of the first of these parameters is stored in gen->first_shared.
3620 * Also compute a projection that projects out the loops that will be
3621 * wrapped over the threads and store this projection in gen->shared_proj.
3623 static void compute_shared_sched(struct cuda_gen *gen)
3625 isl_space *dim;
3626 isl_map *proj;
3627 isl_set *par;
3628 isl_union_map *sched;
3630 sched = isl_union_map_copy(gen->tiled_sched);
3632 dim = isl_union_map_get_space(sched);
3633 gen->first_shared = isl_space_dim(dim, isl_dim_param);
3634 proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
3635 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3637 dim = isl_union_map_get_space(sched);
3638 par = parametrization(dim, gen->shared_len + gen->n_block,
3639 0, gen->shared_len, "g");
3640 sched = isl_union_map_intersect_range(sched,
3641 isl_union_set_from_set(par));
3643 dim = isl_union_map_get_space(sched);
3644 proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
3646 gen->shared_sched = sched;
3647 gen->shared_proj = isl_union_map_from_map(proj);
3650 /* Group references of all arrays in the program.
3652 static void group_references(struct cuda_gen *gen)
3654 int i;
3655 isl_union_map *sched;
3657 sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3658 isl_union_map_copy(gen->shared_proj));
3660 for (i = 0; i < gen->n_array; ++i)
3661 group_array_references(gen, &gen->array[i], sched);
3663 isl_union_map_free(sched);
3666 /* Free all array information that is local to the current kernel.
3668 static void free_local_array_info(struct cuda_gen *gen)
3670 int i, j;
3672 for (i = 0; i < gen->n_array; ++i) {
3673 struct cuda_array_info *array = &gen->array[i];
3675 for (j = 0; j < array->n_group; ++j)
3676 free_array_ref_group(array->groups[j], array->n_index);
3677 free(array->groups);
3679 if (array->n_group == 0)
3680 continue;
3681 for (j = 0; j < gen->array[i].n_index; ++j) {
3682 isl_pw_aff_free(gen->array[i].local_bound[j]);
3683 gen->array[i].local_bound[j] = NULL;
3688 /* The sizes of the arrays on the host that have been computed by
3689 * extract_array_info may depend on the parameters. Use the extra
3690 * constraints on the parameters that are valid at "host_domain"
3691 * to simplify these expressions.
3693 static void localize_bounds(struct cuda_gen *gen,
3694 __isl_keep isl_set *host_domain)
3696 int i, j;
3697 isl_set *context;
3699 context = isl_set_copy(host_domain);
3700 context = isl_set_params(context);
3702 for (i = 0; i < gen->n_array; ++i) {
3703 struct cuda_array_info *array = &gen->array[i];
3705 if (array->n_group == 0)
3706 continue;
3708 for (j = 0; j < array->n_index; ++j) {
3709 isl_pw_aff *pwaff;
3711 pwaff = isl_pw_aff_copy(array->bound[j]);
3712 pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3713 array->local_bound[j] = pwaff;
3716 isl_set_free(context);
3719 /* Set gen->tile_len and gen->n_parallel to those of the first statement
3720 * in the statement list u.
3721 * Because of the way the schedule is constructed, the other statements
3722 * in the list, if any, should have the same values for these properties.
3724 static void set_tile_len(struct cuda_gen *gen, struct clast_user_stmt *u)
3726 int nr;
3727 struct cuda_stmt *stmt;
3729 nr = atoi(u->statement->name + 2);
3730 stmt = &gen->stmts[nr];
3732 gen->tile_len = stmt->tile_len;
3733 gen->n_parallel = stmt->n_parallel;
3736 /* Extract a description of the grid, i.e., the possible values
3737 * of the block ids, from gen->tiled_sched.
3738 * The block ids are parameters in gen->tiled_sched.
3739 * We simply need to change them into set dimensions.
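 * For example (illustrative only), parameter constraints
 * 0 <= b0 <= 7 and 0 <= b1 <= 3 turn into the two-dimensional set
 * { [b0, b1] : 0 <= b0 <= 7 and 0 <= b1 <= 3 }, with the original
 * parameters projected out.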
3741 static __isl_give isl_set *extract_grid(struct cuda_gen *gen)
3743 int i;
3744 isl_set *grid;
3746 grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
3747 grid = isl_set_from_params(grid);
3748 grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
3749 for (i = 0; i < gen->n_grid; ++i) {
3750 int pos;
3751 char name[20];
3753 snprintf(name, sizeof(name), "b%d", i);
3754 pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
3755 assert(pos >= 0);
3756 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
3757 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
3760 return grid;
3763 /* Print the effective grid size as a list of the sizes in each
3764 * dimension, from innermost to outermost.
3766 * The grid size specified by the user or set by default
3767 * in read_grid_sizes() and applied in tile_schedule(),
3768 * may be too large for the given code in the sense that
3769 * it may contain blocks that don't need to execute anything.
3770 * We therefore don't print this grid size, but instead the
3771 * smallest grid size that ensures that all blocks that actually
3772 * execute code are included in the grid.
3774 * For each block dimension, we compute the maximal value of the block id
3775 * and add one.
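* For example (hypothetical grid), if extract_grid returns
* { [b0, b1] : 0 <= b0 <= 15 and 0 <= b1 <= 7 }, then "(8, 16)" is
* printed, i.e., max(b1) + 1 followed by max(b0) + 1.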
3777 static void print_grid_size(struct cuda_gen *gen, __isl_take isl_set *context)
3779 int i;
3780 isl_printer *prn;
3781 isl_set *grid;
3783 if (gen->n_grid == 0) {
3784 isl_set_free(context);
3785 return;
3788 grid = extract_grid(gen);
3790 prn = isl_printer_to_file(gen->ctx, gen->cuda.host_c);
3791 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
3793 prn = isl_printer_print_str(prn, "(");
3794 for (i = gen->n_grid - 1; i >= 0; --i) {
3795 isl_space *space;
3796 isl_aff *one;
3797 isl_pw_aff *bound = isl_set_dim_max(isl_set_copy(grid), i);
3799 bound = isl_pw_aff_coalesce(bound);
3800 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
3802 space = isl_pw_aff_get_domain_space(bound);
3803 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
3804 one = isl_aff_add_constant_si(one, 1);
3805 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
3806 prn = isl_printer_print_pw_aff(prn, bound);
3807 isl_pw_aff_free(bound);
3809 if (i > 0)
3810 prn = isl_printer_print_str(prn, ", ");
3812 prn = isl_printer_print_str(prn, ")");
3814 isl_printer_free(prn);
3815 isl_set_free(grid);
3816 isl_set_free(context);
3819 /* This function is called for each leaf in the clast of the host code.
3820 * We first specialize the schedule to the site of the leaf, compute
3821 * the size of shared memory and then print the body of host code
3822 * and the associated kernel (through a call to print_kernel_body).
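* The host code printed here looks roughly as follows (illustrative only;
* the actual grid/block sizes and argument list depend on the kernel):
*
*	dim3 k0_dimBlock(16, 32);
*	dim3 k0_dimGrid(8, 8);
*	kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_A, N);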
3824 static void print_host_user(struct clast_printer_info *code,
3825 struct clast_user_stmt *u)
3827 struct cuda_gen *gen = code->user;
3828 isl_space *dim;
3829 isl_set *par;
3830 isl_set *host_domain;
3831 isl_union_map *access;
3832 isl_union_map *local_sched;
3833 isl_union_set *arrays;
3835 set_tile_len(gen, u);
3836 read_sizes(gen);
3838 host_domain = extract_entire_host_domain(&u->stmt);
3840 local_sched = isl_union_map_intersect_range(
3841 isl_union_map_copy(gen->sched),
3842 isl_union_set_from_set(extend(isl_set_copy(host_domain),
3843 gen->untiled_len)));
3844 access = isl_union_map_union(isl_union_map_copy(gen->read),
3845 isl_union_map_copy(gen->write));
3846 access = isl_union_map_apply_domain(access,
3847 isl_union_map_copy(local_sched));
3848 arrays = isl_union_map_range(access);
3850 print_indent(code->dst, code->indent);
3851 fprintf(code->dst, "dim3 k%d_dimBlock", gen->kernel_id);
3852 print_reverse_list(code->dst, gen->n_block, gen->block_dim);
3853 fprintf(code->dst, ";\n");
3855 gen->tiled_sched = tile_schedule(gen, local_sched);
3856 gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
3857 gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
3859 print_indent(code->dst, code->indent);
3860 fprintf(code->dst, "dim3 k%d_dimGrid", gen->kernel_id);
3861 print_grid_size(gen, isl_set_params(isl_set_copy(host_domain)));
3862 fprintf(code->dst, ";\n");
3864 gen->local_sched = isl_union_map_copy(gen->tiled_sched);
3866 dim = isl_union_map_get_space(gen->local_sched);
3867 par = parametrization(dim, gen->tiled_len, 0, gen->shared_len, "g");
3868 gen->local_sched = isl_union_map_intersect_range(gen->local_sched,
3869 isl_union_set_from_set(par));
3871 gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
3872 gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
3874 gen->private_access = NULL;
3875 compute_shared_sched(gen);
3876 gen->privatization = compute_privatization(gen);
3877 group_references(gen);
3878 compute_private_size(gen);
3879 check_shared_memory_bound(gen);
3880 localize_bounds(gen, host_domain);
3882 gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
3884 print_kernel_launch(gen, arrays);
3886 fprintf(gen->cuda.kernel_c, "{\n");
3888 print_kernel_body(gen, host_domain, gen->tiled_sched);
3890 fprintf(gen->cuda.kernel_c, "}\n");
3892 free_local_array_info(gen);
3893 isl_map_free(gen->privatization);
3894 isl_union_map_free(gen->private_access);
3895 isl_union_map_free(gen->local_sched);
3896 isl_union_map_free(gen->tiled_sched);
3897 isl_union_map_free(gen->shared_sched);
3898 isl_union_map_free(gen->shared_proj);
3899 isl_union_set_free(arrays);
3900 isl_set_free(host_domain);
3902 free(gen->tile_size);
3903 gen->kernel_id++;
3906 /* Use CLooG to generate code for the outer gen->tile_first loops
3907 * of the global schedule in gen->sched.
3908 * The pretty printing of this code is handled by print_clast,
3909 * which calls print_host_user for each kernel invocation location.
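* The scattering dimensions of these outer loops are given the names
* "h0", "h1", ... (see the snprintf below), so the generated host code
* consists of loops over the h%d iterators enclosing, at each leaf, the
* kernel launch code printed by print_host_user.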
3911 static void print_cloog_host_code(struct cuda_gen *gen)
3913 int i;
3914 isl_set *context;
3915 isl_union_map *sched;
3916 CloogOptions *options;
3917 CloogDomain *cloog_context;
3918 CloogUnionDomain *ud;
3919 CloogInput *input;
3920 struct clast_stmt *stmt;
3921 char name[20];
3923 options = cloog_options_malloc(gen->state);
3924 options->language = CLOOG_LANGUAGE_C;
3925 options->otl = 0;
3926 options->strides = 1;
3927 options->stop = gen->tile_first;
3928 options->f = gen->untiled_len;
3929 options->l = gen->untiled_len;
3930 options->save_domains = 1;
3931 options->noscalars = 1;
3933 sched = isl_union_map_copy(gen->sched);
3934 ud = cloog_union_domain_from_isl_union_map(sched);
3935 for (i = 0; i < options->stop; ++i) {
3936 snprintf(name, sizeof(name), "h%d", i);
3937 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
3939 context = isl_set_copy(gen->context);
3940 cloog_context = cloog_domain_from_isl_set(context);
3941 input = cloog_input_alloc(cloog_context, ud);
3943 stmt = cloog_clast_create_from_input(input, options);
3945 gen->code.indent = 0;
3946 gen->code.dst = gen->cuda.host_c;
3947 gen->code.print_user_stmt = NULL;
3948 gen->code.print_user_stmt_list = &print_host_user;
3949 gen->code.print_for_head = NULL;
3950 gen->code.print_for_foot = NULL;
3951 gen->code.user = gen;
3952 print_clast(&gen->code, stmt);
3954 cloog_clast_free(stmt);
3955 cloog_options_free(options);
3956 fprintf(gen->cuda.host_c, "\n");
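/* Print, to the generated host code, a macro that checks the return value
 * of a CUDA runtime call and a macro that checks for errors after a kernel
 * launch. In the generated code they are used along the lines of
 * (illustrative)
 *
 *	cudaCheckReturn(cudaMalloc((void **) &dev_A, size));
 *	cudaCheckKernel();
 */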
3959 void print_cuda_macros(struct cuda_gen *gen)
3961 const char *macros =
3962 "#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
3963 "#define cudaCheckKernel()"
3964 " assert(cudaGetLastError() == cudaSuccess)\n\n";
3965 fputs(macros, gen->cuda.host_c);
3968 void print_host_code(struct cuda_gen *gen)
3970 fprintf(gen->cuda.host_c, "{\n");
3971 print_cloog_macros(gen->cuda.host_c);
3972 print_cloog_macros(gen->cuda.kernel_c);
3974 print_cuda_macros(gen);
3976 declare_device_arrays(gen);
3978 allocate_device_arrays(gen);
3979 copy_arrays_to_device(gen);
3981 gen->kernel_id = 0;
3982 print_cloog_host_code(gen);
3984 copy_arrays_from_device(gen);
3985 free_device_arrays(gen);
3987 fprintf(gen->cuda.host_c, "}\n");
3990 __isl_give isl_set *add_context_from_str(__isl_take isl_set *set,
3991 const char *str)
3993 isl_ctx *ctx;
3994 isl_set *context;
3996 if (!str)
3997 return set;
3999 ctx = isl_set_get_ctx(set);
4000 context = isl_set_read_from_str(ctx, str);
4001 context = isl_set_align_params(context, isl_set_get_space(set));
4002 set = isl_set_intersect(set, context);
4004 return set;
4007 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
4009 if (!str)
4010 return NULL;
4011 return isl_union_map_read_from_str(ctx, str);
4014 /* Return the union of the iteration domains of all the gen->stmts[i].
4016 static __isl_give isl_union_set *extract_domain(struct cuda_gen *gen)
4018 int i;
4019 isl_union_set *domain;
4021 domain = isl_union_set_empty(isl_set_get_space(gen->context));
4022 for (i = 0; i < gen->n_stmts; ++i) {
4023 isl_set *domain_i;
4025 domain_i = isl_set_copy(gen->stmts[i].domain);
4026 domain = isl_union_set_union(domain,
4027 isl_union_set_from_set(domain_i));
4030 return domain;
4033 /* Information about the outermost tilable bands in the forest of bands.
4035 * tile_len and n_parallel are only set on band_info structures
4036 * that correspond to outermost bands. For other bands (in particular,
4037 * ancestors of the outermost bands), n_parallel is set to 0.
4039 * prefix is the (padded) schedule leading up to the outermost tilable bands.
4041 * tile_first is the number of schedule dimensions in prefix.
4043 * suffix is the schedule of the outermost tilable bands and their descendants.
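* For example (hypothetical schedule), if two sequential loops enclose a
* tilable band of three parallel loops, then tile_first = 2, tile_len = 3
* and n_parallel = 3; prefix maps each iteration to the two outer schedule
* dimensions and suffix to the schedule of the band and its descendants.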
4045 struct band_info {
4046 struct cuda_gen *gen;
4047 int tile_first;
4048 int tile_len;
4049 int n_parallel;
4050 isl_union_map *prefix;
4051 isl_union_map *suffix;
4054 /* Set tile_len and n_parallel of the statement to those of
4055 * its outermost band, as recorded in the band_info.
4057 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
4059 struct band_info *info = user;
4060 int nr;
4061 struct cuda_stmt *stmt;
4063 nr = atoi(isl_map_get_tuple_name(map, isl_dim_in) + 2);
4064 stmt = &info->gen->stmts[nr];
4066 stmt->tile_len = info->tile_len;
4067 stmt->n_parallel = info->n_parallel;
4069 isl_map_free(map);
4071 return 0;
4074 static void list_select_outer_band(struct cuda_gen *gen,
4075 __isl_take isl_band_list *list, int pos, struct band_info *list_info);
4077 /* Check if this band has any parallel loops. If so, take it as
4078 * the outermost tilable band. If not, continue looking for the
4079 * outermost tilable band in the children of the current band.
4081 static void band_select_outer_band(struct cuda_gen *gen,
4082 __isl_take isl_band *band, int pos, struct band_info *info)
4084 int n = isl_band_n_member(band);
4085 int n_parallel;
4087 for (n_parallel = 0; n_parallel < n; ++n_parallel)
4088 if (!isl_band_member_is_zero_distance(band, n_parallel))
4089 break;
4091 info->n_parallel = n_parallel;
4092 if (n_parallel) {
4093 info->gen = gen;
4094 info->tile_first = pos;
4095 info->tile_len = n;
4096 info->prefix = isl_band_get_prefix_schedule(band);
4097 info->suffix = isl_union_map_flat_range_product(
4098 isl_band_get_partial_schedule(band),
4099 isl_band_get_suffix_schedule(band));
4100 isl_union_map_foreach_map(info->prefix,
4101 &set_stmt_tile_len, info);
4102 } else if (isl_band_has_children(band)) {
4103 isl_band_list *children;
4104 children = isl_band_get_children(band);
4105 list_select_outer_band(gen, children, pos + n, info);
4106 } else {
4107 info->gen = gen;
4108 info->tile_first = pos + n;
4109 info->tile_len = 0;
4110 info->prefix = isl_union_map_flat_range_product(
4111 isl_band_get_prefix_schedule(band),
4112 isl_band_get_partial_schedule(band));
4113 info->suffix = isl_band_get_suffix_schedule(band);
4114 isl_union_map_foreach_map(info->prefix,
4115 &set_stmt_tile_len, info);
4118 isl_band_free(band);
4121 /* Comparison function that returns a non-zero value for band_infos
4122 * with different tile_len fields or different n_parallel fields.
4124 static int cmp_band(const void *p1, const void *p2)
4126 const struct band_info *info1 = p1;
4127 const struct band_info *info2 = p2;
4129 if (info1->tile_len != info2->tile_len)
4130 return info1->tile_len - info2->tile_len;
4132 return info1->n_parallel - info2->n_parallel;
4135 /* Extend "umap" with coordinates with fixed value "val"
4136 * to a total length of "dst_len", assuming the original dimension is "src_len".
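* For example (hypothetical relation), with src_len = 2, dst_len = 4 and
* val = 0, a range element [i, j] is extended to [i, j, 0, 0].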
4138 static __isl_give isl_union_map *extend_range(__isl_take isl_union_map *umap,
4139 int src_len, int dst_len, int val)
4141 isl_space *dim;
4142 isl_map *map;
4143 int i;
4145 dim = isl_union_map_get_space(umap);
4146 map = isl_map_reverse(projection(dim, dst_len, src_len));
4147 for (i = src_len; i < dst_len; ++i)
4148 map = isl_map_fix_si(map, isl_dim_out, i, val);
4150 umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
4152 return umap;
4155 /* Group bands with the same values for tile_len and n_parallel.
4156 * The prefix schedule is then extended with a fixed coordinate that
4157 * is different for each such group.
4158 * Note that the actual values for this coordinate are not important.
4159 * The bands have already been effectively separated at a higher level
4160 * or they are independent and may be executed in parallel.
4161 * The list of band_info has been sorted before this function is called.
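* For example (hypothetical input), if the sorted list contains three
* entries with (tile_len, n_parallel) equal to (2, 1), (2, 1) and (3, 2),
* then the prefixes of the first two are extended with coordinate 0 and
* that of the third with coordinate 1.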
4163 static void separate_bands(struct band_info *info, int n)
4165 int i;
4166 int j = 0;
4168 for (i = 0; i < n; ++i) {
4169 int l = info[i].tile_first;
4171 if (i &&
4172 (info[i].tile_len != info[i - 1].tile_len ||
4173 info[i].n_parallel != info[i - 1].n_parallel))
4174 j++;
4176 info[i].prefix = extend_range(info[i].prefix,
4177 l, l + 1, j);
4178 info[i].tile_first = l + 1;
4182 /* Select the outermost bands in the elements of the list, align
4183 * their prefix schedules, separate bands with different values
4184 * for tile_len and/or n_parallel and then combine the resulting
4185 * prefix and suffix schedules into a single pair of prefix and
4186 * suffix schedules for the entire list.
4188 static void list_select_outer_band(struct cuda_gen *gen,
4189 __isl_take isl_band_list *list, int pos, struct band_info *list_info)
4191 isl_band *band;
4192 int i;
4193 int n = isl_band_list_n_band(list);
4194 isl_ctx *ctx = isl_band_list_get_ctx(list);
4195 struct band_info *info;
4196 int max_tile_first;
4197 isl_union_map *prefix;
4198 isl_union_map *suffix;
4200 assert(n >= 1);
4201 info = isl_calloc_array(ctx, struct band_info, n);
4202 assert(info);
4204 max_tile_first = 0;
4205 for (i = 0; i < n; ++i) {
4206 band = isl_band_list_get_band(list, i);
4207 band_select_outer_band(gen, band, pos, &info[i]);
4208 if (info[i].tile_first > max_tile_first)
4209 max_tile_first = info[i].tile_first;
4212 for (i = 0; i < n; ++i) {
4213 if (info[i].tile_first == max_tile_first)
4214 continue;
4215 info[i].prefix = extend_range(info[i].prefix,
4216 info[i].tile_first, max_tile_first, 0);
4217 info[i].tile_first = max_tile_first;
4220 qsort(info, n, sizeof(struct band_info), &cmp_band);
4222 for (i = 0; i < n - 1; ++i)
4223 if (info[i].tile_len != info[i + 1].tile_len ||
4224 info[i].n_parallel != info[i + 1].n_parallel)
4225 break;
4227 if (i < n - 1)
4228 separate_bands(info, n);
4230 prefix = info[0].prefix;
4231 suffix = info[0].suffix;
4233 for (i = 1; i < n; ++i) {
4234 prefix = isl_union_map_union(prefix, info[i].prefix);
4235 suffix = isl_union_map_union(suffix, info[i].suffix);
4238 list_info->tile_first = info[0].tile_first;
4239 list_info->tile_len = -1;
4240 list_info->prefix = prefix;
4241 list_info->suffix = suffix;
4243 isl_band_list_free(list);
4244 free(info);
4247 /* Set max_out to the maximal number of output dimensions over
4248 * all maps.
4250 static int update_max_out(__isl_take isl_map *map, void *user)
4252 int *max_out = user;
4253 int n_out = isl_map_dim(map, isl_dim_out);
4255 if (n_out > *max_out)
4256 *max_out = n_out;
4258 isl_map_free(map);
4259 return 0;
4262 struct align_range_data {
4263 int max_out;
4264 isl_union_map *res;
4267 /* Extend the dimension of the range of the given map to data->max_out and
4268 * then add the result to data->res.
4270 static int map_align_range(__isl_take isl_map *map, void *user)
4272 struct align_range_data *data = user;
4273 int i;
4274 isl_space *dim;
4275 isl_map *proj;
4276 int n_out = isl_map_dim(map, isl_dim_out);
4278 dim = isl_union_map_get_space(data->res);
4279 proj = isl_map_reverse(projection(dim, data->max_out, n_out));
4280 for (i = n_out; i < data->max_out; ++i)
4281 proj = isl_map_fix_si(proj, isl_dim_out, i, 0);
4283 map = isl_map_apply_range(map, proj);
4285 data->res = isl_union_map_add_map(data->res, map);
4287 return 0;
4290 /* Extend the ranges of the maps in the union map such that they all have
4291 * the same dimension.
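* For example (hypothetical input), { A[i] -> [i]; B[i, j] -> [i, j, j] }
* becomes { A[i] -> [i, 0, 0]; B[i, j] -> [i, j, j] }, padding the shorter
* ranges with zeros.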
4293 static __isl_give isl_union_map *align_range(__isl_take isl_union_map *umap)
4295 struct align_range_data data;
4297 data.max_out = 0;
4298 isl_union_map_foreach_map(umap, &update_max_out, &data.max_out);
4300 data.res = isl_union_map_empty(isl_union_map_get_space(umap));
4301 isl_union_map_foreach_map(umap, &map_align_range, &data);
4303 isl_union_map_free(umap);
4304 return data.res;
4307 /* Select the outermost tilable band that (by construction)
4308 * has at least one parallel loop.
4309 * The starting position of the aligned band is stored in
4310 * gen->tile_first.
4311 * The sizes and number of parallel loops may be different in different
4312 * parts of the band forest and are therefore stored in the cuda_stmts.
4314 * Return the complete schedule, with the tilable bands aligned
4315 * at gen->tile_first and padded with zero, if needed.
4317 static __isl_give isl_union_map *select_outer_tilable_band(struct cuda_gen *gen,
4318 __isl_keep isl_schedule *schedule)
4320 isl_band_list *list;
4321 struct band_info info;
4323 gen->n_parallel = 0;
4324 gen->tile_len = -1;
4326 list = isl_schedule_get_band_forest(schedule);
4328 list_select_outer_band(gen, list, 0, &info);
4330 gen->tile_first = info.tile_first;
4331 info.suffix = align_range(info.suffix);
4333 return isl_union_map_flat_range_product(info.prefix, info.suffix);
4336 /* Set gen->untiled_len to the number of scheduling dimensions
4337 * for the schedule of the first domain.
4338 * We assume here that this number is the same for all domains.
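* The callback returns -1 so that isl_union_map_foreach_map stops after
* the first map; the resulting negative return value of the foreach call
* is deliberately ignored by the caller.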
4340 static int set_untiled_len(__isl_take isl_map *map, void *user)
4342 unsigned *untiled_len = user;
4344 *untiled_len = isl_map_dim(map, isl_dim_out);
4346 isl_map_free(map);
4347 return -1;
4350 /* Compute an appropriate schedule based on the accesses in
4351 * gen->read and gen->write.
4353 * We first compute dependences and then use those to compute
4354 * a schedule that has a parallel loop in each tilable band.
4355 * Finally, we select the outermost tilable band.
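* In particular, the first call to isl_union_map_compute_flow below
* computes the read-after-write (flow) dependences and the reads that have
* no corresponding write inside the scop; the array elements accessed by
* the latter end up in gen->copy_in. The second call computes the
* write-after-write and write-after-read dependences.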
4357 static void compute_schedule(struct cuda_gen *gen,
4358 __isl_take isl_union_map *sched)
4360 isl_union_set *domain;
4361 isl_union_map *empty;
4362 isl_union_map *dep_raw, *dep2, *dep3, *dep;
4363 isl_union_map *uninitialized;
4364 isl_schedule *schedule;
4366 empty = isl_union_map_empty(isl_union_map_get_space(sched));
4368 isl_union_map_compute_flow(isl_union_map_copy(gen->read),
4369 isl_union_map_copy(gen->write), empty,
4370 isl_union_map_copy(sched),
4371 &dep_raw, NULL, &uninitialized, NULL);
4372 isl_union_map_compute_flow(isl_union_map_copy(gen->write),
4373 isl_union_map_copy(gen->write),
4374 isl_union_map_copy(gen->read),
4375 isl_union_map_copy(sched),
4376 &dep2, &dep3, NULL, NULL);
4377 isl_union_map_free(sched);
4379 gen->copy_in = isl_union_map_range(uninitialized);
4381 dep = isl_union_map_union(dep2, dep3);
4382 dep = isl_union_map_union(dep, dep_raw);
4383 dep = isl_union_map_coalesce(dep);
4385 domain = extract_domain(gen);
4386 schedule = isl_union_set_compute_schedule(isl_union_set_copy(domain),
4387 isl_union_map_copy(dep), dep);
4389 sched = select_outer_tilable_band(gen, schedule);
4391 isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
4392 sched = isl_union_map_intersect_domain(sched, domain);
4393 gen->sched = sched;
4395 isl_schedule_free(schedule);
4398 static struct cuda_stmt_access **expr_extract_access(struct pet_expr *expr,
4399 struct cuda_stmt_access **next_access)
4401 struct cuda_stmt_access *access;
4402 isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
4404 access = isl_alloc_type(ctx, struct cuda_stmt_access);
4405 assert(access);
4406 access->next = NULL;
4407 access->read = expr->acc.read;
4408 access->write = expr->acc.write;
4409 access->access = isl_map_copy(expr->acc.access);
4411 *next_access = access;
4412 next_access = &(*next_access)->next;
4413 return next_access;
4416 static struct cuda_stmt_access **expr_extract_accesses(struct pet_expr *expr,
4417 struct cuda_stmt_access **next_access)
4419 int i;
4421 for (i = 0; i < expr->n_arg; ++i)
4422 next_access = expr_extract_accesses(expr->args[i],
4423 next_access);
4425 if (expr->type == pet_expr_access)
4426 next_access = expr_extract_access(expr, next_access);
4428 return next_access;
4431 static void pet_stmt_extract_accesses(struct cuda_stmt *stmt)
4433 struct cuda_stmt_access **next_access = &stmt->accesses;
4435 stmt->accesses = NULL;
4436 expr_extract_accesses(stmt->body, next_access);
4439 /* Return an array of cuda_stmt representing the statements in "scop".
4441 static struct cuda_stmt *extract_stmts(isl_ctx *ctx, struct pet_scop *scop,
4442 __isl_keep isl_set *context)
4444 int i;
4445 struct cuda_stmt *stmts;
4447 stmts = isl_calloc_array(ctx, struct cuda_stmt, scop->n_stmt);
4448 assert(stmts);
4450 for (i = 0; i < scop->n_stmt; ++i) {
4451 struct cuda_stmt *s = &stmts[i];
4453 s->domain = isl_set_copy(scop->stmts[i]->domain);
4454 s->domain = isl_set_intersect_params(s->domain,
4455 isl_set_copy(context));
4456 s->body = scop->stmts[i]->body;
4457 pet_stmt_extract_accesses(s);
4460 return stmts;
4463 /* Replace the scop in the "input" file by equivalent code
4464 * that uses the GPU. "scop" is assumed to correspond to this scop.
4466 * We first compute a schedule that respects the dependences
4467 * of the original program and select the outermost band
4468 * of tilable dimensions that has at least one parallel loop.
4469 * We then have three blocks of dimensions
4471 * H B G
4473 * The tilable band "B" is first tiled according to "tile" sizes, resulting
4474 * in
4476 * H T P G
4478 * For each iteration of the T loop and for each array, we compute
4479 * the array elements accessed by that iteration, construct a rectangular
4480 * box around it and shift it to the origin. The result is used
4481 * as shared memory for the array.
4483 * We then split off at most 2 parallel loops from the T loops and
4484 * at most 3 parallel loops from the P loops
4486 * H T1 T2 P1 P2 G
4488 * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
4489 * according to "grid"/"block" sizes.
4491 * H T1T T1P T2 P1T P1P P2 G
4493 * Finally, the T1P and P1P iterators are equated to the block and
4494 * thread dimensions respectively and so are effectively removed.
4495 * The H loops are run on the host. The T1T, T2, P1T, P2 and G loops
4496 * are run on the GPU.
4498 * Code is generated in three stages. We first generate code for the
4499 * host (the H loops), with iterators h%d. Then, for each leaf node
4500 * of the resulting AST, we generate code for the shared loops (up to
4501 * and including T2), with iterators g%d and after equating the H loops
4502 * to h%d parameters and the T1P loops to the block dimensions.
4503 * Finally, we generate code for the remaining loops in a similar fashion.
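* For example (hypothetical input), for a tilable band of three parallel
* loops, T consists of three tile loops and P of three point loops;
* T1 is then formed by the first two tile loops (wrapped over the blocks),
* T2 by the remaining tile loop, P1 by all three point loops (wrapped over
* the threads) and P2 is empty.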
4505 int generate_cuda(isl_ctx *ctx, struct pet_scop *scop,
4506 struct ppcg_options *options, const char *input)
4508 isl_union_map *sched;
4509 struct cuda_gen gen;
4511 if (!scop)
4512 return -1;
4514 scop = pet_scop_align_params(scop);
4516 gen.ctx = ctx;
4517 gen.context = isl_set_copy(scop->context);
4518 gen.context = add_context_from_str(gen.context, options->ctx);
4519 gen.sizes = extract_sizes_from_str(ctx, options->sizes);
4520 gen.n_stmts = scop->n_stmt;
4521 gen.stmts = extract_stmts(ctx, scop, gen.context);
4522 gen.read = pet_scop_collect_reads(scop);
4523 gen.write = pet_scop_collect_writes(scop);
4524 gen.options = options;
4525 gen.state = cloog_isl_state_malloc(gen.ctx);
4526 gen.scop = scop;
4528 cuda_open_files(&gen.cuda, input);
4530 collect_array_info(&gen);
4532 sched = pet_scop_collect_schedule(scop);
4534 compute_schedule(&gen, sched);
4536 print_host_code(&gen);
4538 cloog_state_free(gen.state);
4539 clear_cuda_gen(&gen);
4541 cuda_close_files(&gen.cuda);
4543 return 0;