1 /*
2 * Copyright 2010-2011 INRIA Saclay
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
7 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
8 * 91893 Orsay, France
9 */
11 #include <assert.h>
12 #include <stdlib.h>
14 #include <isl/polynomial.h>
15 #include <isl/union_set.h>
16 #include <isl/aff.h>
17 #include <isl/ilp.h>
18 #include <isl/flow.h>
19 #include <isl/band.h>
20 #include <isl/schedule.h>
21 #include <isl/options.h>
22 #include <cloog/isl/cloog.h>
24 #include "cuda.h"
25 #include "cuda_common.h"
26 #include "gpucode.h"
27 #include "schedule.h"
28 #include "ppcg_options.h"
30 /* The fields stride, shift and shift_map only contain valid information
31 * if shift != NULL.
32 * If so, they express that the current index is such that if you add shift,
33 * then the result is always a multiple of stride.
34 * shift_map contains the mapping
36 * i -> (i + shift)/stride
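 *
 * For example (illustrative values, not taken from the surrounding code):
 * with stride = 4 and shift = 1, the index only takes the values
 * 3, 7, 11, ..., and shift_map sends such an index i to (i + 1)/4,
 * i.e., to 1, 2, 3, ...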
38 struct cuda_array_bound {
39 isl_int size;
40 isl_aff *lb;
42 isl_int stride;
43 isl_aff *shift;
44 isl_basic_map *shift_map;
47 struct cuda_array_info;
49 /* A group of array references in a kernel that should be handled together.
50 * If private_bound is not NULL, then it is mapped to registers.
51 * Otherwise, if shared_bound is not NULL, it is mapped to shared memory.
52 * Otherwise, it is accessed from global memory.
54 struct cuda_array_ref_group {
55 /* The references in this group access this array. */
56 struct cuda_array_info *array;
57 /* Position of this group in the list of reference groups of array. */
58 int nr;
60 /* The following fields are used during the construction of the groups.
61 * access is the combined access relation relative to the shared
62 * memory tiling.
63 * write is set if any access in the group is a write.
65 isl_map *access;
66 int write;
68 /* For each index, size and offset of piece in shared memory. */
69 struct cuda_array_bound *shared_bound;
71 /* For each index, size and offset of piece in private memory. */
72 struct cuda_array_bound *private_bound;
74 /* References in this group; point to elements of a linked list. */
75 int n_ref;
76 struct cuda_stmt_access **refs;
79 struct cuda_array_info {
80 isl_space *dim;
81 /* Element type. */
82 char *type;
83 /* Name of the array. */
84 char *name;
85 /* Number of indices. */
86 unsigned n_index;
87 /* For each index, a bound on the array in that direction. */
88 isl_pw_aff **bound;
89 /* For each index, bound[i] specialized to the current kernel. */
90 isl_pw_aff **local_bound;
92 /* All references to this array; point to elements of a linked list. */
93 int n_ref;
94 struct cuda_stmt_access **refs;
96 /* The reference groups associated to this array. */
97 int n_group;
98 struct cuda_array_ref_group **groups;
100 /* For scalars, is this scalar read-only within the entire program? */
101 int read_only;
104 /* Last shared memory tile dimension that affects the tile of this array. */
104 int last_shared;
105 /* Dimension at which copying to/from shared memory is printed.
106 * if >= 0, then the value is >= last_shared
107 * if -1, then the copying is done at the leaf level.
109 int print_shared_level;
112 /* Print the name of the local copy of a given group of array references.
114 static void print_array_name(FILE *out, struct cuda_array_ref_group *group)
116 int global = 0;
118 if (group->private_bound)
119 fprintf(out, "private_");
120 else if (group->shared_bound)
121 fprintf(out, "shared_");
122 else
123 global = 1;
124 fprintf(out, "%s", group->array->name);
125 if (!global && group->array->n_group > 1)
126 fprintf(out, "_%d", group->nr);
129 /* Collect all references to the given array and store pointers to them
130 * in array->refs.
132 static void collect_references(struct cuda_gen *gen,
133 struct cuda_array_info *array)
135 int i;
136 int n;
138 n = 0;
139 for (i = 0; i < gen->n_stmts; ++i) {
140 struct cuda_stmt *stmt = &gen->stmts[i];
141 struct cuda_stmt_access *access;
143 for (access = stmt->accesses; access; access = access->next) {
144 const char *name;
145 name = isl_map_get_tuple_name(access->access,
146 isl_dim_out);
147 if (name && !strcmp(array->name, name))
148 n++;
152 array->n_ref = n;
153 array->refs = isl_alloc_array(gen->ctx, struct cuda_stmt_access *, n);
154 assert(array->refs);
156 n = 0;
157 for (i = 0; i < gen->n_stmts; ++i) {
158 struct cuda_stmt *stmt = &gen->stmts[i];
159 struct cuda_stmt_access *access;
161 for (access = stmt->accesses; access; access = access->next) {
162 const char *name;
163 name = isl_map_get_tuple_name(access->access,
164 isl_dim_out);
165 if (!name || strcmp(array->name, name))
166 continue;
168 array->refs[n++] = access;
173 static struct cuda_array_bound *create_bound_list(isl_ctx *ctx, int n_index)
175 int i;
176 struct cuda_array_bound *bound;
178 bound = isl_alloc_array(ctx, struct cuda_array_bound, n_index);
179 assert(bound);
181 for (i = 0; i < n_index; ++i) {
182 isl_int_init(bound[i].size);
183 bound[i].lb = NULL;
184 isl_int_init(bound[i].stride);
185 bound[i].shift = NULL;
186 bound[i].shift_map = NULL;
189 return bound;
192 static void free_bound_list(struct cuda_array_bound *bound, int n_index)
194 int j;
196 if (!bound)
197 return;
199 for (j = 0; j < n_index; ++j) {
200 isl_int_clear(bound[j].size);
201 isl_int_clear(bound[j].stride);
202 isl_aff_free(bound[j].lb);
203 isl_aff_free(bound[j].shift);
204 isl_basic_map_free(bound[j].shift_map);
206 free(bound);
209 static struct pet_array *find_array(struct pet_scop *scop,
210 __isl_keep isl_set *accessed)
212 int i;
213 isl_id *id;
215 id = isl_set_get_tuple_id(accessed);
217 for (i = 0; i < scop->n_array; ++i) {
218 isl_id *id_i;
220 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
221 isl_id_free(id_i);
222 if (id == id_i)
223 break;
225 isl_id_free(id);
227 return i < scop->n_array ? scop->arrays[i] : NULL;
230 /* Compute bounds on the host arrays based on the accessed elements
231 * and collect all references to the array.
233 * If the array is zero-dimensional, i.e., a scalar, we check
234 * whether it is read-only.
236 static int extract_array_info(__isl_take isl_set *array, void *user)
238 int i;
239 struct cuda_gen *gen = (struct cuda_gen *)user;
240 const char *name;
241 int n_index;
242 isl_pw_aff **bounds;
243 isl_pw_aff **local_bounds;
244 struct pet_array *pa;
246 n_index = isl_set_dim(array, isl_dim_set);
247 name = isl_set_get_tuple_name(array);
248 bounds = isl_alloc_array(isl_set_get_ctx(array),
249 isl_pw_aff *, n_index);
250 assert(bounds);
251 local_bounds = isl_calloc_array(isl_set_get_ctx(array),
252 isl_pw_aff *, n_index);
253 assert(local_bounds);
254 gen->array[gen->n_array].dim = isl_set_get_space(array);
255 gen->array[gen->n_array].name = strdup(name);
256 gen->array[gen->n_array].n_index = n_index;
257 gen->array[gen->n_array].bound = bounds;
258 gen->array[gen->n_array].local_bound = local_bounds;
260 pa = find_array(gen->scop, array);
261 assert(pa);
263 gen->array[gen->n_array].type = strdup(pa->element_type);
265 if (n_index == 0) {
266 isl_set *space;
267 isl_union_map *write;
268 int empty;
270 write = isl_union_map_copy(gen->write);
271 space = isl_set_universe(isl_set_get_space(array));
272 write = isl_union_map_intersect_range(write,
273 isl_union_set_from_set(space));
274 empty = isl_union_map_is_empty(write);
275 isl_union_map_free(write);
277 gen->array[gen->n_array].read_only = empty;
280 for (i = 0; i < n_index; ++i) {
281 isl_set *dom;
282 isl_local_space *ls;
283 isl_aff *one;
284 isl_pw_aff *bound;
285 isl_set *size = i == 0 ? array : pa->extent;
287 bound = isl_set_dim_max(isl_set_copy(size), i);
288 assert(bound);
289 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
290 ls = isl_local_space_from_space(isl_set_get_space(dom));
291 one = isl_aff_zero_on_domain(ls);
292 one = isl_aff_add_constant_si(one, 1);
293 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
294 bound = isl_pw_aff_gist(bound, isl_set_copy(gen->context));
296 bounds[i] = bound;
299 collect_references(gen, &gen->array[gen->n_array]);
301 gen->n_array++;
303 isl_set_free(array);
304 return 0;
307 void collect_array_info(struct cuda_gen *gen)
309 isl_union_set *arrays;
311 arrays = isl_union_map_range(isl_union_map_copy(gen->read));
312 arrays = isl_union_set_union(arrays,
313 isl_union_map_range(isl_union_map_copy(gen->write)));
314 arrays = isl_union_set_coalesce(arrays);
316 gen->n_array = isl_union_set_n_set(arrays);
317 gen->array = isl_alloc_array(gen->ctx,
318 struct cuda_array_info, gen->n_array);
319 assert(gen->array);
320 gen->n_array = 0;
321 isl_union_set_foreach_set(arrays, &extract_array_info, gen);
322 isl_union_set_free(arrays);
325 static void free_array_info(struct cuda_gen *gen)
327 int i, j;
329 for (i = 0; i < gen->n_array; ++i) {
330 int n_index = gen->array[i].n_index;
331 free(gen->array[i].type);
332 free(gen->array[i].name);
333 for (j = 0; j < n_index; ++j) {
334 isl_pw_aff_free(gen->array[i].bound[j]);
335 isl_pw_aff_free(gen->array[i].local_bound[j]);
337 isl_space_free(gen->array[i].dim);
338 free(gen->array[i].bound);
339 free(gen->array[i].local_bound);
340 free(gen->array[i].refs);
342 free(gen->array);
345 /* Check if a cuda array is a scalar. A scalar is a value that is not stored
346 * as an array or through a pointer reference, but as a single data element. At
347 * the moment, scalars are represented as zero dimensional arrays.
349 static int cuda_array_is_scalar(struct cuda_array_info *array)
351 return (array->n_index == 0);
354 /* Is "array" a read-only scalar?
356 static int cuda_array_is_read_only_scalar(struct cuda_array_info *array)
358 return cuda_array_is_scalar(array) && array->read_only;
361 static void declare_device_arrays(struct cuda_gen *gen)
363 int i;
365 for (i = 0; i < gen->n_array; ++i) {
366 if (cuda_array_is_read_only_scalar(&gen->array[i]))
367 continue;
368 fprintf(gen->cuda.host_c, "%s *dev_%s;\n",
369 gen->array[i].type, gen->array[i].name);
371 fprintf(gen->cuda.host_c, "\n");
374 static void print_array_size(struct cuda_gen *gen, FILE *out,
375 struct cuda_array_info *array)
377 int i;
378 isl_printer *prn;
380 prn = isl_printer_to_file(gen->ctx, out);
381 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
382 for (i = 0; i < array->n_index; ++i) {
383 prn = isl_printer_print_str(prn, "(");
384 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
385 prn = isl_printer_print_str(prn, ") * ");
387 prn = isl_printer_print_str(prn, "sizeof(");
388 prn = isl_printer_print_str(prn, array->type);
389 prn = isl_printer_print_str(prn, ")");
390 isl_printer_free(prn);
393 static void allocate_device_arrays(struct cuda_gen *gen)
395 int i;
397 for (i = 0; i < gen->n_array; ++i) {
398 if (cuda_array_is_read_only_scalar(&gen->array[i]))
399 continue;
400 fprintf(gen->cuda.host_c,
401 "cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
402 gen->array[i].name);
403 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
404 fprintf(gen->cuda.host_c, "));\n");
406 fprintf(gen->cuda.host_c, "\n");
409 static void free_device_arrays(struct cuda_gen *gen)
411 int i;
413 for (i = 0; i < gen->n_array; ++i) {
414 if (cuda_array_is_read_only_scalar(&gen->array[i]))
415 continue;
416 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaFree(dev_%s));\n",
417 gen->array[i].name);
421 static void copy_arrays_to_device(struct cuda_gen *gen)
423 int i;
425 for (i = 0; i < gen->n_array; ++i) {
426 isl_space *dim;
427 isl_set *read_i;
428 int empty;
430 if (cuda_array_is_read_only_scalar(&gen->array[i]))
431 continue;
433 dim = isl_space_copy(gen->array[i].dim);
434 read_i = isl_union_set_extract_set(gen->copy_in, dim);
435 empty = isl_set_fast_is_empty(read_i);
436 isl_set_free(read_i);
437 if (empty)
438 continue;
440 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(dev_%s,",
441 gen->array[i].name);
443 if (cuda_array_is_scalar(&(gen->array[i])))
444 fprintf(gen->cuda.host_c, " &%s, ",
445 gen->array[i].name);
446 else
447 fprintf(gen->cuda.host_c, " %s, ", gen->array[i].name);
449 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
450 fprintf(gen->cuda.host_c, ", cudaMemcpyHostToDevice));\n");
452 fprintf(gen->cuda.host_c, "\n");
455 static void copy_arrays_from_device(struct cuda_gen *gen)
457 int i;
458 isl_union_set *write;
459 write = isl_union_map_range(isl_union_map_copy(gen->write));
461 for (i = 0; i < gen->n_array; ++i) {
462 isl_space *dim;
463 isl_set *write_i;
464 int empty;
466 dim = isl_space_copy(gen->array[i].dim);
467 write_i = isl_union_set_extract_set(write, dim);
468 empty = isl_set_fast_is_empty(write_i);
469 isl_set_free(write_i);
470 if (empty)
471 continue;
473 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(");
474 if (cuda_array_is_scalar(&gen->array[i]))
475 fprintf(gen->cuda.host_c, "&%s, ", gen->array[i].name);
476 else
477 fprintf(gen->cuda.host_c, "%s, ", gen->array[i].name);
478 fprintf(gen->cuda.host_c, "dev_%s, ", gen->array[i].name);
479 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
480 fprintf(gen->cuda.host_c, ", cudaMemcpyDeviceToHost));\n");
483 isl_union_set_free(write);
484 fprintf(gen->cuda.host_c, "\n");
487 static void read_sizes_from_file(struct cuda_gen *gen, const char *filename,
488 int *sizes, int len)
490 int i;
491 FILE *file;
493 file = fopen(filename, "r");
494 if (!file)
495 return;
497 for (i = 0; i < len; ++i)
498 if (fscanf(file, "%d", &sizes[i]) < 1)
499 break;
501 fclose(file);
504 /* Internal data structure for extract_size_of_type.
505 * "type" specifies the name of the space that we want to extract.
506 * "res" is used to store the subset of that space.
508 struct ppcg_extract_size_data {
509 const char *type;
510 isl_set *res;
513 /* This function is called for each set in a union_set.
514 * If the name of the set matches data->type, we store the
515 * set in data->res.
517 static int extract_size_of_type(__isl_take isl_set *size, void *user)
519 struct ppcg_extract_size_data *data = user;
520 const char *name;
522 name = isl_set_get_tuple_name(size);
523 if (name && !strcmp(name, data->type)) {
524 data->res = size;
525 return -1;
528 isl_set_free(size);
529 return 0;
532 /* Given a union map { kernel[i] -> *[...] },
533 * return the range in the space called "type" for the kernel with
534 * sequence number "id".
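 *
 * As an illustration (a hypothetical specification, not one taken from
 * this code), a sizes union map of the shape
 *
 *	{ kernel[0] -> tile[32, 32]; kernel[0] -> block[16, 16] }
 *
 * would yield block[16, 16] when asked for type "block" and id 0.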
536 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
537 const char *type, int id)
539 isl_space *space;
540 isl_set *dom;
541 isl_union_set *local_sizes;
542 struct ppcg_extract_size_data data = { type, NULL };
544 if (!sizes)
545 return NULL;
547 space = isl_union_map_get_space(sizes);
548 space = isl_space_set_from_params(space);
549 space = isl_space_add_dims(space, isl_dim_set, 1);
550 space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
551 dom = isl_set_universe(space);
552 dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
554 local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
555 isl_union_map_copy(sizes));
556 isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
557 isl_union_set_free(local_sizes);
558 return data.res;
561 /* Given a singleton set, extract the first (at most *len) elements
562 * of the single integer tuple into *sizes and update *len if needed.
564 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
566 int i;
567 int dim;
568 isl_int v;
570 if (!set)
571 return;
573 dim = isl_set_dim(set, isl_dim_set);
574 if (dim < *len)
575 *len = dim;
577 isl_int_init(v);
579 for (i = 0; i < *len; ++i) {
580 int ok;
582 ok = isl_set_plain_is_fixed(set, isl_dim_set, i, &v);
583 assert(ok);
585 sizes[i] = isl_int_get_si(v);
588 isl_int_clear(v);
590 isl_set_free(set);
593 /* Extract user specified "tile" sizes from the "sizes" command line option,
594 * defaulting to option->tile_size in each dimension.
596 static void read_tile_sizes(struct cuda_gen *gen)
598 int n;
599 isl_set *size;
601 gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
602 assert(gen->tile_size);
603 for (n = 0; n < gen->tile_len; ++n)
604 gen->tile_size[n] = gen->options->tile_size;
606 size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
607 read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
609 if (gen->n_parallel > gen->tile_len)
610 gen->n_parallel = gen->tile_len;
613 /* Extract user specified "block" sizes from the "sizes" command line option,
614 * after filling in some potentially useful defaults.
616 static void read_block_sizes(struct cuda_gen *gen)
618 int n;
619 isl_set *size;
621 n = gen->n_parallel;
622 gen->n_block = (n <= 3) ? n : 3;
623 switch (gen->n_block) {
624 case 1:
625 gen->block_dim[0] = 512;
626 break;
627 case 2:
628 gen->block_dim[0] = 32;
629 gen->block_dim[1] = 16;
630 break;
631 default:
632 gen->block_dim[0] = 32;
633 gen->block_dim[1] = 4;
634 gen->block_dim[2] = 4;
635 break;
638 size = extract_sizes(gen->sizes, "block", gen->kernel_id);
639 read_sizes_from_set(size, gen->block_dim, &gen->n_block);
642 /* Extract user specified "grid" sizes from the "sizes" command line option,
643 * after filling in some potentially useful defaults.
645 static void read_grid_sizes(struct cuda_gen *gen)
647 int n = gen->n_parallel;
648 isl_set *size;
650 gen->n_grid = (n <= 2) ? n : 2;
651 switch (gen->n_grid) {
652 case 1:
653 gen->grid_dim[0] = 32768;
654 break;
655 default:
656 gen->grid_dim[0] = 256;
657 gen->grid_dim[1] = 256;
658 break;
661 size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
662 read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
665 /* Extract user specified sizes from the "sizes" command line option
666 * after filling in some potentially useful defaults.
668 static void read_sizes(struct cuda_gen *gen)
670 read_tile_sizes(gen);
671 read_block_sizes(gen);
672 read_grid_sizes(gen);
675 static void free_stmts(struct cuda_stmt *stmts, int n)
677 int i;
679 for (i = 0; i < n; ++i) {
680 struct cuda_stmt_access *access, *next;
682 for (access = stmts[i].accesses; access; access = next) {
683 next = access->next;
684 isl_map_free(access->access);
685 free(access);
688 isl_set_free(stmts[i].domain);
690 free(stmts);
693 void clear_cuda_gen(struct cuda_gen *gen)
695 free_stmts(gen->stmts, gen->n_stmts);
696 free_array_info(gen);
697 isl_union_map_free(gen->sizes);
698 isl_set_free(gen->context);
699 isl_union_set_free(gen->copy_in);
700 isl_union_map_free(gen->sched);
701 isl_union_map_free(gen->read);
702 isl_union_map_free(gen->write);
705 static void print_reverse_list(FILE *out, int len, int *list)
707 int i;
709 if (len == 0)
710 return;
712 fprintf(out, "(");
713 for (i = 0; i < len; ++i) {
714 if (i)
715 fprintf(out, ", ");
716 fprintf(out, "%d", list[len - 1 - i]);
718 fprintf(out, ")");
721 static void print_kernel_launch(struct cuda_gen *gen,
722 __isl_keep isl_union_set *arrays)
724 int i;
725 int first = 1;
726 unsigned nparam;
727 isl_space *dim;
729 print_indent(gen->code.dst, gen->code.indent);
730 fprintf(gen->code.dst, "kernel%d <<<k%d_dimGrid, k%d_dimBlock>>> (",
731 gen->kernel_id, gen->kernel_id, gen->kernel_id);
732 fprintf(gen->cuda.kernel_c, "__global__ void kernel%d(",
733 gen->kernel_id);
734 fprintf(gen->cuda.kernel_h, "__global__ void kernel%d(",
735 gen->kernel_id);
737 for (i = 0; i < gen->n_array; ++i) {
738 isl_space *dim;
739 isl_set *arr;
740 int empty;
742 dim = isl_space_copy(gen->array[i].dim);
743 arr = isl_union_set_extract_set(arrays, dim);
744 empty = isl_set_fast_is_empty(arr);
745 isl_set_free(arr);
746 if (empty)
747 continue;
749 if (!first) {
750 fprintf(gen->code.dst, ", ");
751 fprintf(gen->cuda.kernel_c, ", ");
752 fprintf(gen->cuda.kernel_h, ", ");
755 if (cuda_array_is_read_only_scalar(&gen->array[i])) {
756 fprintf(gen->code.dst, "%s", gen->array[i].name);
757 fprintf(gen->cuda.kernel_c, "%s %s",
758 gen->array[i].type, gen->array[i].name);
759 fprintf(gen->cuda.kernel_h, "%s %s",
760 gen->array[i].type, gen->array[i].name);
761 } else {
762 fprintf(gen->code.dst, "dev_%s", gen->array[i].name);
763 fprintf(gen->cuda.kernel_c, "%s *%s",
764 gen->array[i].type, gen->array[i].name);
765 fprintf(gen->cuda.kernel_h, "%s *%s",
766 gen->array[i].type, gen->array[i].name);
769 first = 0;
772 dim = isl_union_set_get_space(arrays);
773 nparam = isl_space_dim(dim, isl_dim_param);
774 for (i = 0; i < nparam; ++i) {
775 const char *name = isl_space_get_dim_name(dim, isl_dim_param, i);
776 if (!first) {
777 fprintf(gen->code.dst, ", ");
778 fprintf(gen->cuda.kernel_c, ", ");
779 fprintf(gen->cuda.kernel_h, ", ");
781 fprintf(gen->code.dst, "%s", name);
782 fprintf(gen->cuda.kernel_c, "int %s", name);
783 fprintf(gen->cuda.kernel_h, "int %s", name);
784 first = 0;
786 isl_space_free(dim);
788 for (i = 0; i < gen->tile_first; ++i) {
789 if (!first) {
790 fprintf(gen->code.dst, ", ");
791 fprintf(gen->cuda.kernel_c, ", ");
792 fprintf(gen->cuda.kernel_h, ", ");
794 fprintf(gen->code.dst, "h%d", i);
795 fprintf(gen->cuda.kernel_c, "int h%d", i);
796 fprintf(gen->cuda.kernel_h, "int h%d", i);
797 first = 0;
800 fprintf(gen->code.dst, ");\n");
801 fprintf(gen->cuda.kernel_c, ")\n");
802 fprintf(gen->cuda.kernel_h, ");\n");
804 fprintf(gen->code.dst, "cudaCheckKernel();\n");
807 /* Construct a map from a domain of dimensionality "len"
808 * to a domain of dimensionality "len" + "tile_len" that tiles
809 * the "tile_len" coordinates starting at "first".
810 * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
811 * "dim" prescribes the parameters.
813 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
814 int first, int tile_len, int *tile_size)
816 int i;
817 isl_int v;
818 isl_basic_map *bmap;
819 isl_constraint *c;
820 isl_local_space *ls;
822 isl_int_init(v);
824 dim = isl_space_add_dims(dim, isl_dim_in, len);
825 dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
826 bmap = isl_basic_map_universe(isl_space_copy(dim));
827 ls = isl_local_space_from_space(dim);
829 for (i = 0; i < len - tile_len; ++i) {
830 int j = i < first ? i : i + tile_len;
831 int k = i < first ? i : i + 2 * tile_len;
833 c = isl_equality_alloc(isl_local_space_copy(ls));
834 isl_int_set_si(v, -1);
835 isl_constraint_set_coefficient(c, isl_dim_in, j, v);
836 isl_int_set_si(v, 1);
837 isl_constraint_set_coefficient(c, isl_dim_out, k, v);
838 bmap = isl_basic_map_add_constraint(bmap, c);
841 for (i = 0; i < tile_len; ++i) {
842 c = isl_equality_alloc(isl_local_space_copy(ls));
843 isl_int_set_si(v, -1);
844 isl_constraint_set_coefficient(c, isl_dim_in, first + i, v);
845 isl_int_set_si(v, tile_size[i]);
846 isl_constraint_set_coefficient(c, isl_dim_out, first + i, v);
847 isl_int_set_si(v, 1);
848 isl_constraint_set_coefficient(c, isl_dim_out,
849 first + i + tile_len, v);
850 bmap = isl_basic_map_add_constraint(bmap, c);
852 c = isl_inequality_alloc(isl_local_space_copy(ls));
853 isl_int_set_si(v, 1);
854 isl_constraint_set_coefficient(c, isl_dim_out,
855 first + i + tile_len, v);
856 bmap = isl_basic_map_add_constraint(bmap, c);
858 c = isl_inequality_alloc(isl_local_space_copy(ls));
859 isl_int_set_si(v, -1);
860 isl_constraint_set_coefficient(c, isl_dim_out,
861 first + i + tile_len, v);
862 isl_int_set_si(v, tile_size[i] - 1);
863 isl_constraint_set_constant(c, v);
864 bmap = isl_basic_map_add_constraint(bmap, c);
867 isl_local_space_free(ls);
868 isl_int_clear(v);
870 return isl_map_from_basic_map(bmap);
873 /* Construct a map from a domain of dimensionality "len"
874 * to a domain of dimensionality "len" + "wrap_len" that "wraps"
875 * the "wrap_len" coordinates starting at "first" according to "wrap_size".
876 * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
877 * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
878 * that are projected out at the end.
879 * "dim" prescribes the parameters.
881 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
882 int first, int wrap_len, int *wrap_size)
884 int i;
885 isl_basic_map *bmap;
886 isl_constraint *c;
887 isl_local_space *ls;
889 dim = isl_space_add_dims(dim, isl_dim_in, len);
890 dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
891 bmap = isl_basic_map_universe(isl_space_copy(dim));
892 ls = isl_local_space_from_space(dim);
894 for (i = 0; i < len; ++i) {
895 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
897 c = isl_equality_alloc(isl_local_space_copy(ls));
898 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
899 isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
900 bmap = isl_basic_map_add_constraint(bmap, c);
903 for (i = 0; i < wrap_len; ++i) {
904 c = isl_equality_alloc(isl_local_space_copy(ls));
905 isl_constraint_set_coefficient_si(c, isl_dim_out,
906 first + i, -1);
907 isl_constraint_set_coefficient_si(c, isl_dim_out,
908 first + wrap_len + i, 1);
909 isl_constraint_set_coefficient_si(c, isl_dim_out,
910 first + 2 * wrap_len + i, wrap_size[i]);
911 bmap = isl_basic_map_add_constraint(bmap, c);
913 c = isl_inequality_alloc(isl_local_space_copy(ls));
914 isl_constraint_set_coefficient_si(c, isl_dim_out,
915 first + wrap_len + i, 1);
916 bmap = isl_basic_map_add_constraint(bmap, c);
918 c = isl_inequality_alloc(isl_local_space_copy(ls));
919 isl_constraint_set_coefficient_si(c, isl_dim_out,
920 first + wrap_len + i, -1);
921 isl_constraint_set_constant_si(c, wrap_size[i] - 1);
922 bmap = isl_basic_map_add_constraint(bmap, c);
925 isl_local_space_free(ls);
927 bmap = isl_basic_map_project_out(bmap, isl_dim_out,
928 first + 2 * wrap_len, wrap_len);
930 return isl_map_from_basic_map(bmap);
933 /* Add "n" parameters named prefix%d.
935 static __isl_give isl_set *add_params( __isl_take isl_set *set,
936 int n, const char *prefix)
938 int i;
939 unsigned nparam;
940 char name[20];
942 nparam = isl_set_dim(set, isl_dim_param);
943 set = isl_set_add_dims(set, isl_dim_param, n);
945 for (i = 0; i < n; ++i) {
946 snprintf(name, sizeof(name), "%s%d", prefix, i);
947 set = isl_set_set_dim_name(set, isl_dim_param,
948 nparam + i, name);
951 return set;
954 /* Equate the "n" dimensions of "set" starting at "first" to
955 * freshly created parameters named prefix%d.
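 *
 * For instance (illustrative), parametrize(set, 1, 2, "t") creates the
 * parameters t0 and t1 and equates set dimensions 1 and 2 to t0 and t1,
 * respectively.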
957 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
958 int first, int n, const char *prefix)
960 int i;
961 unsigned nparam;
962 isl_int v;
963 isl_space *dim;
964 isl_basic_set *bset;
965 isl_constraint *c;
966 isl_local_space *ls;
968 nparam = isl_set_dim(set, isl_dim_param);
970 set = add_params(set, n, prefix);
972 dim = isl_set_get_space(set);
973 bset = isl_basic_set_universe(isl_space_copy(dim));
974 ls = isl_local_space_from_space(dim);
976 isl_int_init(v);
978 for (i = 0; i < n; ++i) {
979 c = isl_equality_alloc(isl_local_space_copy(ls));
980 isl_int_set_si(v, -1);
981 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
982 isl_int_set_si(v, 1);
983 isl_constraint_set_coefficient(c, isl_dim_set, first + i, v);
984 bset = isl_basic_set_add_constraint(bset, c);
987 isl_int_clear(v);
988 isl_local_space_free(ls);
990 return isl_set_intersect(set, isl_set_from_basic_set(bset));
993 static __isl_give isl_set *parametrization(__isl_take isl_space *dim,
994 int len, int first, int n, const char *prefix)
996 isl_set *set;
998 dim = isl_space_add_dims(dim, isl_dim_set, len);
999 set = isl_set_universe(dim);
1001 return parametrize(set, first, n, prefix);
1004 /* Tile the B loops over the tile sizes and then tile/wrap
1005 * the T1 loops over the blocks.
1007 static __isl_give isl_union_map *tile_schedule(struct cuda_gen *gen,
1008 __isl_take isl_union_map *sched)
1010 isl_space *dim;
1011 isl_map *tiling, *block_tiling;
1013 dim = isl_union_map_get_space(sched);
1014 tiling = tile(isl_space_copy(dim), gen->untiled_len,
1015 gen->tile_first, gen->tile_len, gen->tile_size);
1017 if (gen->options->wrap)
1018 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
1019 gen->tile_first, gen->n_grid, gen->grid_dim);
1020 else
1021 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
1022 gen->tile_first, gen->n_grid, gen->grid_dim);
1024 gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
1026 tiling = isl_map_apply_range(tiling, block_tiling);
1028 sched = isl_union_map_apply_range(sched,
1029 isl_union_map_from_map(tiling));
1031 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1033 return sched;
1036 static __isl_give isl_union_map *parametrize_tiled_schedule(
1037 struct cuda_gen *gen, __isl_take isl_union_map *sched)
1039 isl_space *dim;
1040 isl_set *par;
1042 dim = isl_union_map_get_space(sched);
1043 par = parametrization(dim, gen->tiled_len, 0, gen->tile_first, "h");
1044 sched = isl_union_map_intersect_range(sched,
1045 isl_union_set_from_set(par));
1047 dim = isl_union_map_get_space(sched);
1048 par = parametrization(dim, gen->tiled_len,
1049 gen->tile_first + gen->n_grid, gen->n_grid, "b");
1050 sched = isl_union_map_intersect_range(sched,
1051 isl_union_set_from_set(par));
1053 return sched;
1056 /* Tile/wrap the P1 loops over the threads.
1058 static __isl_give isl_union_map *thread_tile_schedule(struct cuda_gen *gen,
1059 __isl_take isl_union_map *sched)
1061 isl_space *dim;
1062 isl_map *tiling;
1063 isl_set *par;
1065 dim = isl_union_map_get_space(sched);
1067 if (gen->options->wrap)
1068 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
1069 gen->shared_len, gen->n_block, gen->block_dim);
1070 else
1071 tiling = tile(isl_space_copy(dim), gen->tiled_len,
1072 gen->shared_len, gen->n_block, gen->block_dim);
1073 gen->thread_tiled_len = gen->tiled_len + gen->n_block;
1075 sched = isl_union_map_apply_range(sched,
1076 isl_union_map_from_map(tiling));
1078 par = parametrization(dim, gen->thread_tiled_len,
1079 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1080 gen->n_block, "t");
1081 sched = isl_union_map_intersect_range(sched,
1082 isl_union_set_from_set(par));
1084 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1086 return sched;
1089 /* If the user asked for it, scale the shared memory tile loops
1090 * (T1T and T2) of "sched" by gen->tile_size[i].
1091 * If we are not performing "wrapping", then additionally scale the T1P
1092 * loops by gen->grid_dim[i].
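 *
 * As an illustrative example, with tile_size[i] = 32 and grid_dim[i] = 256
 * and wrapping disabled, a tile loop iterator value of 3 would be scaled
 * to 3 * 32 = 96, or to 3 * 32 * 256 = 24576 for the loops that are
 * additionally scaled by grid_dim.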
1094 static __isl_give isl_union_map *scale_tile_loops(struct cuda_gen *gen,
1095 __isl_take isl_union_map *sched)
1097 int i;
1098 isl_space *dim;
1099 isl_basic_map *scale;
1100 isl_constraint *c;
1101 isl_local_space *ls;
1103 if (!gen->options->scale_tile_loops)
1104 return sched;
1106 dim = isl_union_map_get_space(sched);
1107 dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
1108 dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
1109 scale = isl_basic_map_universe(isl_space_copy(dim));
1110 ls = isl_local_space_from_space(dim);
1112 for (i = 0; i < gen->tiled_len; ++i) {
1113 int f = 1;
1115 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1116 f = gen->tile_size[i - gen->tile_first];
1117 if (!gen->options->wrap)
1118 f *= gen->grid_dim[i - gen->tile_first];
1119 } else if (i >= gen->tile_first + gen->n_grid &&
1120 i < gen->tile_first + gen->n_grid + gen->tile_len) {
1121 f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1124 c = isl_equality_alloc(isl_local_space_copy(ls));
1125 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1126 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1127 scale = isl_basic_map_add_constraint(scale, c);
1130 isl_local_space_free(ls);
1132 sched = isl_union_map_apply_range(sched,
1133 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1135 return sched;
1138 /* If we are not performing "wrapping" and if the user asked for it,
1139 * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1141 static __isl_give isl_union_map *scale_thread_tile_loops(struct cuda_gen *gen,
1142 __isl_take isl_union_map *sched)
1144 int i;
1145 isl_space *dim;
1146 isl_basic_map *scale;
1147 isl_constraint *c;
1148 isl_local_space *ls;
1150 if (gen->options->wrap)
1151 return sched;
1152 if (!gen->options->scale_tile_loops)
1153 return sched;
1155 dim = isl_union_map_get_space(sched);
1156 dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1157 dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1158 scale = isl_basic_map_universe(isl_space_copy(dim));
1159 ls = isl_local_space_from_space(dim);
1161 for (i = 0; i < gen->thread_tiled_len; ++i) {
1162 int f = 1;
1164 if (i >= gen->shared_len &&
1165 i < gen->shared_len + gen->n_block)
1166 f = gen->block_dim[i - gen->shared_len];
1168 c = isl_equality_alloc(isl_local_space_copy(ls));
1169 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1170 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1171 scale = isl_basic_map_add_constraint(scale, c);
1174 isl_local_space_free(ls);
1176 sched = isl_union_map_apply_range(sched,
1177 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1179 return sched;
1182 /* If we are not performing "wrapping" and if the user asked for it,
1183 * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1185 static __isl_give isl_union_map *scale_access_tile_loops(struct cuda_gen *gen,
1186 __isl_take isl_union_map *sched, int len, int first, int n_tile)
1188 int i;
1189 isl_space *dim;
1190 isl_basic_map *scale;
1191 isl_constraint *c;
1192 isl_local_space *ls;
1194 if (gen->options->wrap)
1195 return sched;
1196 if (!gen->options->scale_tile_loops)
1197 return sched;
1199 dim = isl_union_map_get_space(sched);
1200 dim = isl_space_add_dims(dim, isl_dim_in, len);
1201 dim = isl_space_add_dims(dim, isl_dim_out, len);
1202 scale = isl_basic_map_universe(isl_space_copy(dim));
1203 ls = isl_local_space_from_space(dim);
1205 for (i = 0; i < len; ++i) {
1206 int f = 1;
1208 if (i >= first && i < first + n_tile)
1209 f = gen->block_dim[i - first];
1211 c = isl_equality_alloc(isl_local_space_copy(ls));
1212 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1213 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1214 scale = isl_basic_map_add_constraint(scale, c);
1217 isl_local_space_free(ls);
1219 sched = isl_union_map_apply_range(sched,
1220 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1222 return sched;
1225 /* If print_user_stmt is set, we want to print the statements ourselves,
1226 * instead of relying on the C preprocessor. If so, we need to use
1227 * the stop option so that the domains will be saved on the statement
1228 * nodes.
1230 static void print_cloog_shared_body(struct cuda_gen *gen,
1231 __isl_keep isl_set *context, __isl_keep isl_union_map *sched, int len,
1232 void (*print_user_stmt)(struct gpucode_info *info,
1233 struct clast_user_stmt *s),
1234 int first_unroll)
1236 int i;
1237 CloogOptions *options;
1238 CloogDomain *cloog_context;
1239 CloogUnionDomain *ud;
1240 CloogInput *input;
1241 struct clast_stmt *stmt;
1242 char name[20];
1244 sched = isl_union_map_copy(sched);
1245 sched = isl_union_map_align_params(sched, isl_set_get_space(context));
1247 options = cloog_options_malloc(gen->state);
1248 options->language = CLOOG_LANGUAGE_C;
1249 options->strides = 1;
1250 options->sh = 1;
1251 options->f = len;
1252 options->l = -1;
1253 options->override = 1;
1254 options->save_domains = 1;
1255 options->noscalars = 1;
1256 options->first_unroll = first_unroll;
1258 ud = cloog_union_domain_from_isl_union_map(sched);
1259 for (i = 0; i < len; ++i) {
1260 snprintf(name, sizeof(name), "c%d", i);
1261 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
1263 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
1264 input = cloog_input_alloc(cloog_context, ud);
1266 stmt = cloog_clast_create_from_input(input, options);
1268 gen->stmt_code.indent = gen->kernel_code.indent;
1269 gen->stmt_code.dst = gen->cuda.kernel_c;
1270 gen->stmt_code.print_user_stmt = print_user_stmt;
1271 gen->stmt_code.print_user_stmt_list = NULL;
1272 gen->stmt_code.print_for_head = NULL;
1273 gen->stmt_code.print_for_foot = NULL;
1274 gen->stmt_code.user = gen;
1275 gpu_print_host_stmt(&gen->stmt_code, stmt);
1277 cloog_clast_free(stmt);
1278 cloog_options_free(options);
1281 /* Add "len" parameters p[i] called prefix%d,
1282 * with bounds 0 <= p[i] < size[i].
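 *
 * For example (illustrative values), calling this function with len = 2,
 * size = { 32, 4 } and prefix "t" adds parameters t0 and t1 with
 * 0 <= t0 <= 31 and 0 <= t1 <= 3.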
1284 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1285 int len, int *size, const char *prefix)
1287 int i;
1288 unsigned nparam;
1289 isl_int v;
1290 isl_space *dim;
1291 isl_basic_set *bset;
1292 isl_constraint *c;
1293 isl_local_space *ls;
1294 char name[20];
1296 nparam = isl_set_dim(set, isl_dim_param);
1297 set = isl_set_add_dims(set, isl_dim_param, len);
1299 for (i = 0; i < len; ++i) {
1300 snprintf(name, sizeof(name), "%s%d", prefix, i);
1301 set = isl_set_set_dim_name(set, isl_dim_param,
1302 nparam + i, name);
1305 dim = isl_set_get_space(set);
1306 bset = isl_basic_set_universe(isl_space_copy(dim));
1307 ls = isl_local_space_from_space(dim);
1309 isl_int_init(v);
1311 for (i = 0; i < len; ++i) {
1312 c = isl_inequality_alloc(isl_local_space_copy(ls));
1313 isl_int_set_si(v, 1);
1314 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1315 bset = isl_basic_set_add_constraint(bset, c);
1317 c = isl_inequality_alloc(isl_local_space_copy(ls));
1318 isl_int_set_si(v, -1);
1319 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1320 isl_int_set_si(v, size[i] - 1);
1321 isl_constraint_set_constant(c, v);
1322 bset = isl_basic_set_add_constraint(bset, c);
1325 isl_int_clear(v);
1326 isl_local_space_free(ls);
1328 return isl_set_intersect(set, isl_set_from_basic_set(bset));
1331 static void print_shared_body(struct cuda_gen *gen,
1332 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched,
1333 int len, void (*print_user_stmt)(struct gpucode_info *info,
1334 struct clast_user_stmt *s),
1335 int first_unroll)
1337 isl_set *context;
1339 context = isl_set_copy(shared_domain);
1340 context = parametrize(context, 0, gen->shared_len, "g");
1341 context = isl_set_project_out(context, isl_dim_set, 0, gen->shared_len);
1342 context = add_bounded_parameters(context,
1343 gen->n_block, gen->block_dim, "t");
1345 print_cloog_shared_body(gen, context, sched, len, print_user_stmt,
1346 first_unroll);
1348 isl_set_free(context);
1351 /* Given a tile of an array, construct a map that maps each element
1352 * of the tile to a copy of the tile shifted to the origin
1353 * (based on the lower bounds in group->private_bound or group->shared_bound).
1354 * If any of the indices is strided, then {private,shared}_bound[i].shift_map
1355 * is applied to the index first.
1356 * The domain of the resulting map is "access",
1357 * while the range space is anonymous.
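 *
 * As an illustrative example, if bounds[i].lb evaluates to 32 * b0 and
 * there is no stride in dimension i, then index a_i of the tile is
 * mapped to a_i - 32 * b0 in the shifted copy.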
1359 static __isl_give isl_map *shift_access(__isl_take isl_set *access,
1360 struct cuda_array_ref_group *group)
1362 int i;
1363 isl_space *dim;
1364 isl_basic_set *bset;
1365 isl_basic_map *bmap;
1366 isl_aff *lb;
1367 isl_basic_set *offset;
1368 isl_basic_map *shift;
1369 isl_basic_map *pre_shift;
1370 isl_map *sched;
1371 const char *name;
1372 struct cuda_array_bound *bounds;
1373 int n_index = group->array->n_index;
1375 bounds = group->private_bound;
1376 if (!bounds)
1377 bounds = group->shared_bound;
1379 dim = isl_set_get_space(access);
1380 dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
1381 offset = isl_basic_set_universe(dim);
1382 for (i = 0; i < n_index; ++i) {
1383 lb = isl_aff_copy(bounds[i].lb);
1384 bmap = isl_basic_map_from_aff(lb);
1385 bset = isl_basic_map_range(bmap);
1386 offset = isl_basic_set_flat_product(offset, bset);
1388 offset = isl_basic_set_neg(offset);
1390 dim = isl_space_map_from_set(isl_set_get_space(access));
1391 shift = isl_basic_map_identity(dim);
1392 shift = isl_basic_map_set_tuple_name(shift, isl_dim_out, NULL);
1394 bset = isl_basic_set_universe(isl_set_get_space(access));
1395 bmap = isl_basic_map_from_domain_and_range(bset, offset);
1397 shift = isl_basic_map_sum(shift, bmap);
1399 dim = isl_set_get_space(access);
1400 dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
1401 dim = isl_space_map_from_set(dim);
1402 pre_shift = isl_basic_map_universe(isl_space_copy(dim));
1403 dim = isl_space_add_dims(dim, isl_dim_in, 1);
1404 dim = isl_space_add_dims(dim, isl_dim_out, 1);
1405 for (i = 0; i < n_index; ++i) {
1406 if (!bounds[i].shift_map)
1407 bmap = isl_basic_map_identity(isl_space_copy(dim));
1408 else
1409 bmap = isl_basic_map_copy(bounds[i].shift_map);
1410 pre_shift = isl_basic_map_flat_product(pre_shift, bmap);
1412 isl_space_free(dim);
1413 name = isl_basic_map_get_tuple_name(shift, isl_dim_in);
1414 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_in, name);
1415 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_out, name);
1416 shift = isl_basic_map_apply_range(pre_shift, shift);
1418 sched = isl_map_from_basic_map(shift);
1419 sched = isl_map_intersect_domain(sched, access);
1421 return sched;
1424 /* Construct a schedule for iterating over all elements in the given
1425 * piece of an array. The schedule iterates over a copy of the piece
1426 * that is shifted to the origin.
1427 * We subsequently also perform the tiling/wrapping over the threads.
1429 * In particular, we tile the final iterators so that the final thread
1430 * dimension runs over the final array dimension.
1431 * However, if those final iterators have only a single iteration,
1432 * we try to tile earlier iterators instead.
1434 static __isl_give isl_union_map *access_schedule(struct cuda_gen *gen,
1435 __isl_take isl_set *access, struct cuda_array_ref_group *group)
1437 isl_space *dim;
1438 isl_map *sched;
1439 isl_union_map *usched;
1440 isl_map *tiling;
1441 isl_set *par;
1442 unsigned nvar = isl_set_dim(access, isl_dim_set);
1443 int n_tile;
1444 int first;
1446 sched = shift_access(access, group);
1448 n_tile = gen->n_block;
1449 if (n_tile > nvar) {
1450 int i;
1451 sched = isl_map_insert_dims(sched,
1452 isl_dim_out, 0, n_tile - nvar);
1453 for (i = 0; i < n_tile - nvar; ++i)
1454 sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1455 nvar = n_tile;
1458 first = nvar - n_tile;
1460 for (; first > 0; first --)
1461 if (!isl_map_plain_is_fixed(sched, isl_dim_out,
1462 first + n_tile - 1, NULL))
1463 break;
1465 dim = isl_map_get_space(sched);
1466 dim = isl_space_params(dim);
1467 if (gen->options->wrap)
1468 tiling = wrap(isl_space_copy(dim), nvar, first,
1469 n_tile, gen->block_dim);
1470 else
1471 tiling = tile(isl_space_copy(dim), nvar, first,
1472 n_tile, gen->block_dim);
1473 sched = isl_map_apply_range(sched, tiling);
1475 par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1476 usched = isl_union_map_from_map(sched);
1477 usched = isl_union_map_intersect_range(usched,
1478 isl_union_set_from_set(par));
1480 usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1481 first, n_tile);
1483 return usched;
1486 /* Print an access to the element in the global memory copy of the
1487 * given array that corresponds to the element described by "pma"
1488 * of the original array.
1489 * The copy in global memory has been linearized, so we need to take
1490 * the array size into account.
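 *
 * For a two-dimensional array A with size N x M (illustrative names),
 * the element described by indices (i, j) is thus printed as
 * A[(i) * (M) + j].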
1492 static void print_global_index(FILE *out,
1493 struct cuda_array_info *array, __isl_keep isl_pw_multi_aff *pma)
1495 int i;
1496 isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
1497 isl_printer *prn;
1499 if (cuda_array_is_scalar(array)) {
1500 fprintf(out, "*%s", array->name);
1501 return;
1504 fprintf(out, "%s[", array->name);
1505 prn = isl_printer_to_file(ctx, out);
1506 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1507 for (i = 0; i + 1 < array->n_index; ++i)
1508 prn = isl_printer_print_str(prn, "(");
1509 for (i = 0; i < array->n_index; ++i) {
1510 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);
1511 pa = isl_pw_aff_coalesce(pa);
1512 if (i) {
1513 prn = isl_printer_print_str(prn, ") * (");
1514 prn = isl_printer_print_pw_aff(prn,
1515 array->local_bound[i]);
1516 prn = isl_printer_print_str(prn, ") + ");
1518 prn = isl_printer_print_pw_aff(prn, pa);
1519 isl_pw_aff_free(pa);
1521 isl_printer_free(prn);
1522 fprintf(out, "]");
1525 /* Given an index expression into a tile of an array, adjust the expression
1526 * to a shift of the tile to the origin
1527 * (based on the lower bounds in array->shared_bound).
1528 * If the index is strided, then we first add
1529 * bound->shift and divide by bound->stride.
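 *
 * For example (illustrative values), with stride = 4, shift = 1 and a
 * lower bound of 32 * b0, an index expression a becomes
 * (a + 1)/4 - 32 * b0.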
1531 static __isl_give isl_pw_aff *shift_index(__isl_take isl_pw_aff *pa,
1532 struct cuda_array_info *array,
1533 struct cuda_array_bound *bound, __isl_take isl_set *domain)
1535 isl_aff *lb;
1536 isl_pw_aff *tmp;
1538 if (bound->shift) {
1539 isl_aff *shift;
1540 shift = bound->shift;
1541 shift = isl_aff_copy(shift);
1542 shift = isl_aff_project_domain_on_params(shift);
1543 shift = isl_aff_align_params(shift, isl_pw_aff_get_space(pa));
1544 tmp = isl_pw_aff_alloc(isl_set_copy(domain), shift);
1545 pa = isl_pw_aff_add(pa, tmp);
1546 pa = isl_pw_aff_scale_down(pa, bound->stride);
1549 lb = isl_aff_copy(bound->lb);
1550 lb = isl_aff_project_domain_on_params(lb);
1552 lb = isl_aff_align_params(lb, isl_pw_aff_get_space(pa));
1554 tmp = isl_pw_aff_alloc(isl_set_copy(domain), lb);
1555 pa = isl_pw_aff_sub(pa, tmp);
1556 pa = isl_pw_aff_coalesce(pa);
1557 pa = isl_pw_aff_gist(pa, domain);
1559 return pa;
1562 /* Print an access to the element in the private/shared memory copy of the
1563 * given array reference group that corresponds to the element described
1564 * by "pma" of the original array.
1565 * Since the array in private/shared memory is just a shifted copy of part
1566 * of the original array, we simply need to subtract the lower bound,
1567 * which was computed in can_tile_for_shared_memory.
1568 * If any of the indices is strided, then we first add
1569 * bounds[i].shift and divide by bounds[i].stride.
1571 static void print_local_index(FILE *out,
1572 struct cuda_array_ref_group *group, struct cuda_array_bound *bounds,
1573 __isl_keep isl_pw_multi_aff *pma, __isl_keep isl_set *domain)
1575 int i;
1576 isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
1577 isl_printer *prn;
1578 struct cuda_array_info *array = group->array;
1580 print_array_name(out, group);
1581 for (i = 0; i < array->n_index; ++i) {
1582 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);
1584 pa = shift_index(pa, array, &bounds[i], isl_set_copy(domain));
1586 fprintf(out, "[");
1587 prn = isl_printer_to_file(ctx, out);
1588 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1589 prn = isl_printer_print_pw_aff(prn, pa);
1590 isl_printer_free(prn);
1591 fprintf(out, "]");
1592 isl_pw_aff_free(pa);
1596 /* This function is called for each leaf in the clast of the code
1597 * for copying to or from shared/private memory.
1598 * The statement name is {read,write}_{shared,private}_<array>.
1600 * The schedule iterates over the array elements, so we can use
1601 * the domain of copy_sched at the current scheduling position
1602 * as the index of the array.
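 *
 * For an array called A (illustrative name), reading its shared memory
 * copy is thus performed by a statement called read_shared_A
 * (with a _<nr> suffix if the array has more than one reference group).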
1604 static void print_copy_statement(struct gpucode_info *code,
1605 struct clast_user_stmt *u)
1607 struct cuda_gen *gen = code->user;
1608 isl_set *domain;
1609 isl_map *sched;
1610 struct cuda_array_ref_group *group = gen->copy_group;
1611 struct cuda_array_bound *bounds = gen->copy_bound;
1612 int i;
1613 unsigned n_in;
1614 unsigned n_out;
1615 isl_space *dim;
1616 isl_set *param;
1617 isl_set *index;
1618 isl_pw_multi_aff *pma;
1619 int read;
1621 read = !strncmp(u->statement->name, "read", 4);
1623 domain = extract_host_domain(u);
1624 assert(domain);
1626 sched = isl_map_copy(gen->copy_sched);
1627 sched = isl_map_reverse(sched);
1628 sched = isl_map_intersect_domain(sched, domain);
1629 n_in = isl_map_dim(sched, isl_dim_in);
1630 n_out = isl_map_dim(sched, isl_dim_out);
1631 dim = isl_map_get_space(sched);
1632 dim = isl_space_drop_dims(dim, isl_dim_in, 0, n_in);
1633 dim = isl_space_drop_dims(dim, isl_dim_out, 0, n_out);
1634 param = parametrization(dim, n_in, 0, n_in, "c");
1635 sched = isl_map_align_params(sched, isl_set_get_space(param));
1636 sched = isl_map_intersect_domain(sched, param);
1637 index = isl_map_range(sched);
1638 domain = isl_set_copy(index);
1639 pma = isl_pw_multi_aff_from_set(index);
1640 pma = isl_pw_multi_aff_coalesce(pma);
1641 domain = isl_set_params(domain);
1643 print_indent(code->dst, code->indent);
1644 if (read) {
1645 print_local_index(code->dst, group, bounds, pma, domain);
1646 fprintf(code->dst, " = ");
1647 print_global_index(code->dst, group->array, pma);
1648 } else {
1649 print_global_index(code->dst, group->array, pma);
1650 fprintf(code->dst, " = ");
1651 print_local_index(code->dst, group, bounds, pma, domain);
1653 fprintf(code->dst, ";\n");
1655 isl_pw_multi_aff_free(pma);
1656 isl_set_free(domain);
1659 static void print_shared_access(struct cuda_gen *gen,
1660 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
1661 const char *type, struct cuda_array_ref_group *group)
1663 const char *array_name;
1664 char *name;
1665 isl_ctx *ctx;
1666 isl_union_map *sched;
1667 unsigned nvar = isl_set_dim(access, isl_dim_set);
1668 int n_tile;
1670 ctx = isl_set_get_ctx(access);
1671 array_name = isl_set_get_tuple_name(access);
1672 name = isl_alloc_array(ctx, char,
1673 strlen(type) + sizeof("_shared_") + strlen(array_name) + 20);
1674 if (group->array->n_group > 1)
1675 sprintf(name, "%s_shared_%s_%d", type, array_name, group->nr);
1676 else
1677 sprintf(name, "%s_shared_%s", type, array_name);
1678 access = isl_set_set_tuple_name(access, name);
1679 free(name);
1681 sched = access_schedule(gen, access, group);
1683 n_tile = gen->n_block;
1684 if (n_tile > nvar)
1685 n_tile = nvar;
1687 gen->copy_sched = isl_map_from_union_map(isl_union_map_copy(sched));
1688 gen->copy_group = group;
1689 gen->copy_bound = group->shared_bound;
1691 print_shared_body(gen, shared_domain, sched, nvar + n_tile,
1692 &print_copy_statement, -1);
1694 isl_union_map_free(sched);
1695 isl_map_free(gen->copy_sched);
1698 /* Return the union of all read (read = 1) and/or write (write = 1)
1699 * access relations in the group.
1701 static __isl_give isl_union_map *group_access_relation(
1702 struct cuda_array_ref_group *group, int read, int write)
1704 int i;
1705 isl_union_map *access;
1707 access = isl_union_map_empty(isl_map_get_space(group->access));
1708 for (i = 0; i < group->n_ref; ++i) {
1709 isl_map *map_i;
1711 if (!((read && group->refs[i]->read) ||
1712 (write && group->refs[i]->write)))
1713 continue;
1714 map_i = isl_map_copy(group->refs[i]->access);
1715 access = isl_union_map_union(access,
1716 isl_union_map_from_map(map_i));
1719 return access;
1722 /* Check that none of the shared memory tiles involve any strides.
1724 static int no_strides(struct cuda_array_ref_group *group)
1726 int i;
1727 int n_index = group->array->n_index;
1729 for (i = 0; i < n_index; ++i)
1730 if (group->shared_bound[i].shift)
1731 return 0;
1733 return 1;
1736 /* Return a set containing the values of the given index i
1737 * of the elements in the array tile in global memory that corresponds
1738 * to the shared memory copy.
1739 * In particular, if a is the index, we return a set with constraints
1741 * tile_offset <= a <= tile_offset + tile_size - 1
1743 * and
1745 * 0 <= a <= array_size - 1
1748 static __isl_give isl_set *group_tile_dim(struct cuda_array_ref_group *group,
1749 int i)
1751 isl_basic_set *tile;
1752 isl_aff *aff;
1753 isl_constraint *c;
1754 isl_local_space *ls;
1755 isl_pw_aff *bound;
1756 isl_set *dom;
1757 isl_set *tile_set;
1759 aff = isl_aff_copy(group->shared_bound[i].lb);
1760 aff = isl_aff_add_dims(aff, isl_dim_in, 1);
1761 ls = isl_aff_get_domain_local_space(aff);
1762 aff = isl_aff_neg(aff);
1763 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1764 c = isl_inequality_from_aff(isl_aff_copy(aff));
1765 tile = isl_basic_set_from_constraint(c);
1767 aff = isl_aff_neg(aff);
1768 aff = isl_aff_add_constant(aff, group->shared_bound[i].size);
1769 aff = isl_aff_add_constant_si(aff, -1);
1770 c = isl_inequality_from_aff(aff);
1771 tile = isl_basic_set_add_constraint(tile, c);
1773 aff = isl_aff_zero_on_domain(ls);
1774 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1775 c = isl_inequality_from_aff(aff);
1776 tile = isl_basic_set_add_constraint(tile, c);
1778 bound = isl_pw_aff_copy(group->array->bound[i]);
1779 bound = isl_pw_aff_add_dims(bound, isl_dim_in, 1);
1780 ls = isl_local_space_from_space(isl_pw_aff_get_domain_space(bound));
1781 aff = isl_aff_zero_on_domain(ls);
1782 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1783 aff = isl_aff_add_constant_si(aff, 1);
1784 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
1786 tile_set = isl_pw_aff_ge_set(bound, isl_pw_aff_alloc(dom, aff));
1787 tile_set = isl_set_align_params(tile_set, isl_basic_set_get_space(tile));
1788 tile_set = isl_set_intersect(tile_set, isl_set_from_basic_set(tile));
1790 return tile_set;
1793 /* Return a set containing the elements in the array tile in
1794 * global memory that corresponds to the shared memory copy.
1796 static __isl_give isl_set *group_tile(struct cuda_array_ref_group *group)
1798 int i;
1799 int n_index = group->array->n_index;
1800 isl_set *tile;
1802 tile = group_tile_dim(group, 0);
1803 for (i = 1; i < n_index; ++i) {
1804 isl_set *tile_i;
1806 tile_i = group_tile_dim(group, i);
1807 tile = isl_set_flat_product(tile, tile_i);
1810 tile = isl_set_set_tuple_name(tile, group->array->name);
1812 return tile;
1815 /* Print code for reading into or writing from shared memory
1816 * the given array reference group.
1818 * sched maps the original iteration domains to the shared memory tile loops.
1820 * If we are performing a read from global memory to shared memory,
1821 * if the array involved is not a scalar and if the definition of the
1822 * shared memory tiles does not involve any strides, then we copy
1823 * the entire tile to shared memory. This may result in some extra
1824 * elements getting copied, but it should lead to simpler code
1825 * (which means that fewer registers may be needed) and less divergence.
1827 * Otherwise, we only copy the elements that will be read or have been written
1828 * in the kernel.
1830 * Note that the requirement that there be no strides can easily be lifted.
1831 * We would just need to add constraints of the form
1833 * shift + a = stride * alpha
1835 static int print_group_shared_accesses(struct cuda_gen *gen,
1836 struct cuda_array_ref_group *group, const char *type,
1837 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched)
1839 int read;
1840 isl_union_map *access;
1841 isl_union_set *uset;
1842 isl_set *access_set;
1844 if (group->private_bound)
1845 return 0;
1846 if (!group->shared_bound)
1847 return 0;
1849 read = !strcmp(type, "read");
1851 access = group_access_relation(group, read, !read);
1852 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
1853 uset = isl_union_map_range(access);
1855 if (isl_union_set_is_empty(uset)) {
1856 isl_union_set_free(uset);
1857 return 0;
1860 if (read && group->array->n_index > 0 && no_strides(group)) {
1861 isl_union_set_free(uset);
1862 access_set = group_tile(group);
1863 print_shared_access(gen, shared_domain, access_set,
1864 type, group);
1865 return 1;
1868 access_set = isl_set_from_union_set(uset);
1869 access_set = isl_set_coalesce(access_set);
1871 print_shared_access(gen, shared_domain, access_set, type, group);
1873 return 1;
1876 /* Print code for reading into or writing from shared memory at
1877 * the given level (-1 for innermost).
1879 * If we are not printing at the innermost level, then the dimensionality
1880 * of shared_domain may be smaller than gen->shared_len.
1881 * As the rest of the code assumes that the domain of access has
1882 * gen->shared_len dimensions, we therefore may need to embed this domain
1883 * in a higher dimensional space after intersection with shared_domain.
1885 static void print_shared_accesses(struct cuda_gen *gen,
1886 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
1887 const char *type, int level)
1889 int i, j;
1890 isl_space *dim;
1891 isl_map *proj;
1892 isl_set *par;
1893 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
1894 int sync = 0;
1895 isl_union_map *sched;
1897 shared_domain = isl_set_copy(shared_domain);
1898 sched = isl_union_map_copy(gen->tiled_sched);
1899 dim = isl_union_map_get_space(sched);
1900 proj = projection(dim, gen->tiled_len, shared_len);
1901 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
1902 sched = isl_union_map_intersect_range(sched,
1903 isl_union_set_from_set(isl_set_copy(shared_domain)));
1904 if (shared_len != gen->shared_len) {
1905 dim = isl_union_map_get_space(sched);
1906 proj = projection(dim, gen->shared_len, shared_len);
1907 proj = isl_map_reverse(proj);
1908 shared_domain = isl_set_apply(shared_domain,
1909 isl_map_copy(proj));
1910 sched = isl_union_map_apply_range(sched,
1911 isl_union_map_from_map(proj));
1914 dim = isl_union_map_get_space(sched);
1915 par = parametrization(dim, gen->shared_len, 0, gen->shared_len, "g");
1916 sched = isl_union_map_intersect_range(sched,
1917 isl_union_set_from_set(par));
1919 for (i = 0; i < gen->n_array; ++i) {
1920 struct cuda_array_info *array = &gen->array[i];
1922 if (gen->array[i].print_shared_level != level)
1923 continue;
1925 for (j = 0; j < array->n_group; ++j) {
1926 if (print_group_shared_accesses(gen, array->groups[j],
1927 type, shared_domain, sched))
1928 sync = 1;
1932 isl_union_map_free(sched);
1933 isl_set_free(shared_domain);
1935 if (sync) {
1936 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
1937 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
1941 /* This function is called for each access to an array in some statement
1942 * in the original code.
1943 * Replace that access by an access to shared or (linearized) global memory.
1944 * Since the array in shared memory is just
1945 * a shifted copy of part of the original array, we simply need
1946 * to subtract the lower bound, which was computed
1947 * in can_tile_for_shared_memory.
1948 * If any of the indices is strided, then we first add
1949 * shared_bound[i].shift and divide by shared_bound[i].stride.
1951 * If the given array is accessed directly from global memory,
1952 * we don't need to perform any shifting and simply simplify
1953 * the expression in the context of the domain instead.
1955 * If the array space (range of access) has no name, then we are
1956 * accessing an iterator in the original program.
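/* As an illustration (names hypothetical), an access A[i][j] to a global
 * array with inner dimension of size N is printed roughly in linearized
 * form as
 *
 *	A[(i) * (N) + j]
 *
 * while the same access to a copy in shared memory with lower bounds
 * lb0 and lb1 is printed roughly as
 *
 *	shared_A[i - lb0][j - lb1]  */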
1958 static void print_access(struct cuda_gen *gen, __isl_take isl_map *access,
1959 int group_nr)
1961 int i;
1962 const char *name;
1963 unsigned n_index;
1964 struct cuda_array_info *array = NULL;
1965 isl_printer *prn;
1966 isl_pw_multi_aff *pma;
1967 isl_set *data_set;
1968 isl_set *domain;
1969 struct cuda_array_bound *bounds = NULL;
1971 access = isl_map_align_params(access,
1972 isl_set_get_space(gen->stmt_domain));
1974 data_set = isl_set_apply(isl_set_copy(gen->stmt_domain), access);
1976 name = isl_set_get_tuple_name(data_set);
1978 if (!name)
1979 fprintf(gen->cuda.kernel_c, "(");
1980 else {
1981 struct cuda_array_ref_group *group;
1983 for (i = 0; i < gen->n_array; ++i) {
1984 if (strcmp(name, gen->array[i].name))
1985 continue;
1986 array = &gen->array[i];
1988 assert(array);
1989 group = array->groups[group_nr];
1990 bounds = group->private_bound;
1991 if (!bounds)
1992 bounds = group->shared_bound;
1994 if (!bounds && cuda_array_is_scalar(array) && !array->read_only)
1995 fprintf(gen->cuda.kernel_c, "*");
1996 print_array_name(gen->cuda.kernel_c, group);
1998 if (cuda_array_is_scalar(array)) {
1999 isl_set_free(data_set);
2000 return;
2003 fprintf(gen->cuda.kernel_c, "[");
2007 n_index = isl_set_dim(data_set, isl_dim_set);
2008 pma = isl_pw_multi_aff_from_set(data_set);
2009 pma = isl_pw_multi_aff_coalesce(pma);
2011 prn = isl_printer_to_file(gen->ctx, gen->cuda.kernel_c);
2012 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
2014 if (!bounds)
2015 for (i = 0; i + 1 < n_index; ++i)
2016 prn = isl_printer_print_str(prn, "(");
2018 for (i = 0; i < n_index; ++i) {
2019 isl_pw_aff *index;
2021 index = isl_pw_multi_aff_get_pw_aff(pma, i);
2023 if (!array) {
2024 prn = isl_printer_print_pw_aff(prn, index);
2025 isl_pw_aff_free(index);
2026 continue;
2029 domain = isl_set_copy(gen->stmt_domain);
2030 domain = isl_set_params(domain);
2031 if (!bounds) {
2032 index = isl_pw_aff_coalesce(index);
2033 index = isl_pw_aff_gist(index, domain);
2034 } else
2035 index = shift_index(index, array, &bounds[i], domain);
2037 if (i) {
2038 if (!bounds) {
2039 prn = isl_printer_print_str(prn, ") * (");
2040 prn = isl_printer_print_pw_aff(prn,
2041 array->local_bound[i]);
2042 prn = isl_printer_print_str(prn, ") + ");
2043 } else
2044 prn = isl_printer_print_str(prn, "][");
2046 prn = isl_printer_print_pw_aff(prn, index);
2047 isl_pw_aff_free(index);
2049 if (!name)
2050 prn = isl_printer_print_str(prn, ")");
2051 else
2052 prn = isl_printer_print_str(prn, "]");
2053 isl_printer_free(prn);
2055 isl_pw_multi_aff_free(pma);
2058 static struct cuda_stmt_access *print_expr(struct cuda_gen *gen, FILE *out,
2059 struct pet_expr *expr, struct cuda_stmt_access *access, int outer)
2061 int i;
2063 switch (expr->type) {
2064 case pet_expr_double:
2065 fprintf(out, "%g", expr->d);
2066 break;
2067 case pet_expr_access:
2068 print_access(gen, isl_map_copy(access->access), access->group);
2069 access = access->next;
2070 break;
2071 case pet_expr_unary:
2072 if (!outer)
2073 fprintf(out, "(");
2074 fprintf(out, " %s ", pet_op_str(expr->op));
2075 access = print_expr(gen, out, expr->args[pet_un_arg],
2076 access, 0);
2077 if (!outer)
2078 fprintf(out, ")");
2079 break;
2080 case pet_expr_binary:
2081 if (!outer)
2082 fprintf(out, "(");
2083 access = print_expr(gen, out, expr->args[pet_bin_lhs],
2084 access, 0);
2085 fprintf(out, " %s ", pet_op_str(expr->op));
2086 access = print_expr(gen, out, expr->args[pet_bin_rhs],
2087 access, 0);
2088 if (!outer)
2089 fprintf(out, ")");
2090 break;
2091 case pet_expr_ternary:
2092 if (!outer)
2093 fprintf(out, "(");
2094 access = print_expr(gen, out, expr->args[pet_ter_cond],
2095 access, 0);
2096 fprintf(out, " ? ");
2097 access = print_expr(gen, out, expr->args[pet_ter_true],
2098 access, 0);
2099 fprintf(out, " : ");
2100 access = print_expr(gen, out, expr->args[pet_ter_false],
2101 access, 0);
2102 if (!outer)
2103 fprintf(out, ")");
2104 break;
2105 case pet_expr_call:
2106 fprintf(out, "%s(", expr->name);
2107 for (i = 0; i < expr->n_arg; ++i) {
2108 if (i)
2109 fprintf(out, ", ");
2110 access = print_expr(gen, out, expr->args[i],
2111 access, 1);
2113 fprintf(out, ")");
2115 return access;
2118 static void print_stmt_body(struct cuda_gen *gen,
2119 FILE *out, struct cuda_stmt *stmt)
2121 print_expr(gen, out, stmt->body, stmt->accesses, 1);
2122 fprintf(out, ";\n");
2125 /* This function is called for each leaf in the innermost clast,
2126 * i.e., for each statement.
2127 * We print the statement body, simplifying the accesses based
2128 * on the schedule.
2130 static void print_statement(struct gpucode_info *code,
2131 struct clast_user_stmt *u)
2133 struct cuda_gen *gen = code->user;
2134 isl_space *dim;
2135 isl_set *par;
2136 isl_set *stmt_domain;
2137 isl_union_map *stmt_sched;
2138 isl_union_set *uset;
2139 int nr;
2140 struct cuda_stmt *stmt;
2142 nr = atoi(u->statement->name + 2);
2143 stmt = &gen->stmts[nr];
2145 stmt_domain = extract_host_domain(u);
2147 stmt_sched = isl_union_map_intersect_range(
2148 isl_union_map_copy(gen->local_sched),
2149 isl_union_set_from_set(extend(stmt_domain,
2150 gen->thread_tiled_len)));
2151 dim = isl_union_map_get_space(stmt_sched);
2152 par = parametrization(dim, gen->thread_tiled_len, 0,
2153 gen->thread_tiled_len, "c");
2154 stmt_sched = isl_union_map_intersect_range(stmt_sched,
2155 isl_union_set_from_set(par));
2157 uset = isl_union_map_domain(stmt_sched);
2158 dim = isl_union_set_get_space(uset);
2159 dim = isl_space_add_dims(dim, isl_dim_set,
2160 isl_set_dim(stmt->domain, isl_dim_set));
2161 dim = isl_space_set_tuple_name(dim, isl_dim_set, u->statement->name);
2162 gen->stmt_domain = isl_union_set_extract_set(uset, dim);
2163 isl_union_set_free(uset);
2165 print_indent(code->dst, code->indent);
2166 print_stmt_body(gen, code->dst, stmt);
2168 isl_set_free(gen->stmt_domain);
2171 static void print_private_access(struct cuda_gen *gen,
2172 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
2173 const char *type, struct cuda_array_ref_group *group)
2175 const char *array_name;
2176 char *name;
2177 isl_ctx *ctx;
2178 unsigned nvar = isl_set_dim(access, isl_dim_set);
2179 isl_union_map *usched;
2181 if (isl_set_fast_is_empty(access)) {
2182 isl_set_free(access);
2183 return;
2186 ctx = isl_set_get_ctx(access);
2187 array_name = isl_set_get_tuple_name(access);
2188 name = isl_alloc_array(ctx, char,
2189 strlen(type) + sizeof("_private_") + strlen(array_name) + 20);
2190 if (group->array->n_group > 1)
2191 sprintf(name, "%s_private_%s_%d", type, array_name, group->nr);
2192 else
2193 sprintf(name, "%s_private_%s", type, array_name);
2194 access = isl_set_set_tuple_name(access, name);
2195 free(name);
2197 gen->copy_sched = shift_access(access, group);
2198 gen->copy_group = group;
2199 gen->copy_bound = group->private_bound;
2201 usched = isl_union_map_from_map(isl_map_copy(gen->copy_sched));
2202 print_shared_body(gen, shared_domain, usched, nvar,
2203 &print_copy_statement, 1);
2204 isl_union_map_free(usched);
2206 isl_map_free(gen->copy_sched);
2209 /* Print code for reading into or writing from private memory
2210 * the given array reference group.
2212 * sched maps the original iteration domains to the shared memory tile loops.
2214 static void print_group_private_accesses(struct cuda_gen *gen,
2215 struct cuda_array_ref_group *group,
2216 const char *type, __isl_keep isl_set *shared_domain,
2217 unsigned first_shared, int shared_len, __isl_keep isl_union_map *sched)
2219 int read;
2220 isl_union_map *access;
2221 isl_union_set *uset;
2222 isl_set *access_set;
2224 if (!group->private_bound)
2225 return;
2227 read = !strcmp(type, "read");
2229 access = group_access_relation(group, read, !read);
2230 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
2231 access = isl_union_map_intersect(access,
2232 isl_union_map_copy(gen->private_access));
2233 uset = isl_union_map_range(access);
2235 if (isl_union_set_is_empty(uset)) {
2236 isl_union_set_free(uset);
2237 return;
2240 access_set = isl_set_from_union_set(uset);
2241 access_set = isl_set_coalesce(access_set);
2242 access_set = isl_set_eliminate(access_set, isl_dim_param,
2243 first_shared + shared_len,
2244 gen->shared_len - shared_len);
2246 print_private_access(gen, shared_domain, access_set, type, group);
2249 /* Print code for reading into or writing from private memory at
2250 * the given level (-1 for innermost).
2252 * If we are not printing at the innermost level, then the dimensionality
2253 * of shared_domain may be smaller than gen->shared_len.
2254 * As the rest of the code assumes that the domain of access has
2255 * gen->shared_len dimensions, we therefore may need to embed this domain
2256 * in a higher dimensional space after intersection with shared_domain.
2258 * This code is very similar to print_shared_accesses.
2259 * The main difference is that we need to take gen->private_access into account.
2261 static void print_private_accesses(struct cuda_gen *gen,
2262 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
2263 const char *type, int level)
2265 int i, j;
2266 isl_space *dim;
2267 isl_map *proj;
2268 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
2269 unsigned first_shared;
2270 isl_union_map *sched;
2272 shared_domain = isl_set_copy(shared_domain);
2273 sched = isl_union_map_copy(gen->tiled_sched);
2274 dim = isl_union_map_get_space(sched);
2275 first_shared = isl_space_dim(dim, isl_dim_param);
2276 proj = projection(dim, gen->tiled_len, shared_len);
2277 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2278 sched = isl_union_map_intersect_range(sched,
2279 isl_union_set_from_set(isl_set_copy(shared_domain)));
2280 if (shared_len != gen->shared_len) {
2281 dim = isl_union_map_get_space(sched);
2282 proj = projection(dim, gen->shared_len, shared_len);
2283 proj = isl_map_reverse(proj);
2284 shared_domain = isl_set_apply(shared_domain,
2285 isl_map_copy(proj));
2286 sched = isl_union_map_apply_range(sched,
2287 isl_union_map_from_map(proj));
2290 for (i = 0; i < gen->n_array; ++i) {
2291 struct cuda_array_info *array = &gen->array[i];
2293 if (gen->array[i].print_shared_level != level)
2294 continue;
2296 for (j = 0; j < array->n_group; ++j)
2297 print_group_private_accesses(gen, array->groups[j],
2298 type, shared_domain,
2299 first_shared, shared_len, sched);
2302 isl_union_map_free(sched);
2303 isl_set_free(shared_domain);
2306 /* Set unroll[j] if the input dimension j is involved in
2307 * the index expression represented by bmap.
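/* For example, if an output dimension is defined by the equality
 *
 *	o = i0 + 2 * i2
 *
 * then unroll[0] and unroll[2] get set.  */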
2309 static int check_unroll(__isl_take isl_basic_map *bmap, void *user)
2311 int i, j;
2312 int n_in = isl_basic_map_dim(bmap, isl_dim_in);
2313 int n_out = isl_basic_map_dim(bmap, isl_dim_out);
2314 int *unroll = user;
2316 for (i = 0; i < n_out; ++i) {
2317 isl_constraint *c;
2318 int ok;
2320 ok = isl_basic_map_has_defining_equality(bmap,
2321 isl_dim_out, i, &c);
2322 assert(ok);
2323 for (j = 0; j < n_in; ++j)
2324 if (isl_constraint_involves_dims(c, isl_dim_in, j, 1))
2325 unroll[j] = 1;
2326 isl_constraint_free(c);
2329 isl_basic_map_free(bmap);
2330 return 0;
2333 /* Given an array pos mapping input dimensions to the corresponding
2334 * output dimension, construct the corresponding map.
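/* For example, pos = { 2, 0, 1 } yields the map
 *
 *	{ [i0, i1, i2] -> [i1, i2, i0] }
 *
 * since output dimension pos[i] is equated to input dimension i.  */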
2336 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
2337 int *pos, int len)
2339 int i;
2340 isl_constraint *c;
2341 isl_basic_map *bmap;
2342 isl_local_space *ls;
2344 dim = isl_space_add_dims(dim, isl_dim_in, len);
2345 dim = isl_space_add_dims(dim, isl_dim_out, len);
2346 bmap = isl_basic_map_universe(isl_space_copy(dim));
2347 ls = isl_local_space_from_space(dim);
2349 for (i = 0; i < len; ++i) {
2350 c = isl_equality_alloc(isl_local_space_copy(ls));
2351 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
2352 isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i], 1);
2353 bmap = isl_basic_map_add_constraint(bmap, c);
2355 isl_local_space_free(ls);
2357 return isl_map_from_basic_map(bmap);
2360 /* Find all loops involved in any of the index expressions for any of
2361 * the private accesses, move them innermost and then mark them as
2362 * requiring unrolling by setting gen->first_unroll.
2363 * The loops involved should all be parallel because of the checks
2364 * we performed in check_private_group_access. Moving them innermost
2365 * is therefore a valid transformation.
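/* For example, with four thread tile dimensions of which only the third
 * is involved in a private index expression, i.e., unroll = { 0, 0, 1, 0 },
 * the computed permutation maps positions (0, 1, 2, 3) to (0, 1, 3, 2),
 * moving the to-be-unrolled dimension innermost while preserving the
 * relative order of the remaining dimensions.  */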
2367 static __isl_give isl_union_map *interchange_for_unroll(struct cuda_gen *gen,
2368 __isl_take isl_union_map *sched)
2370 int i, j;
2371 int unroll[gen->thread_tiled_len];
2372 int perm[gen->thread_tiled_len];
2373 isl_space *dim;
2374 isl_map *permute;
2375 int len = gen->shared_len + gen->n_parallel + gen->n_block;
2377 gen->first_unroll = -1;
2379 for (i = 0; i < gen->thread_tiled_len; ++i)
2380 unroll[i] = 0;
2381 for (i = 0; i < gen->n_array; ++i) {
2382 struct cuda_array_info *array = &gen->array[i];
2384 for (j = 0; j < array->n_group; ++j) {
2385 isl_union_map *access;
2386 isl_map *acc;
2388 if (!array->groups[j]->private_bound)
2389 continue;
2391 access = group_access_relation(array->groups[j], 1, 1);
2392 access = isl_union_map_apply_domain(access,
2393 isl_union_map_copy(sched));
2395 acc = isl_map_from_union_map(access);
2396 isl_map_foreach_basic_map(acc, &check_unroll, unroll);
2398 isl_map_free(acc);
2402 for (i = 0; i < gen->shared_len; ++i)
2403 if (unroll[i])
2404 return sched;
2406 for (i = gen->shared_len; i < len; ++i)
2407 if (unroll[i])
2408 break;
2410 if (i >= len)
2411 return sched;
2413 for (i = len; i < gen->thread_tiled_len; ++i)
2414 if (unroll[i])
2415 return sched;
2417 j = 0;
2418 for (i = 0; i < gen->thread_tiled_len; ++i)
2419 if (!unroll[i])
2420 perm[i] = j++;
2421 gen->first_unroll = 1 + j;
2422 for (i = 0; i < len; ++i)
2423 if (unroll[i])
2424 perm[i] = j++;
2426 dim = isl_union_map_get_space(sched);
2427 permute = permutation(dim, perm, gen->thread_tiled_len);
2428 sched = isl_union_map_apply_range(sched,
2429 isl_union_map_from_map(permute));
2431 return sched;
2434 /* This function is called for each leaf in the clast of the kernel code.
2435 * We first specialize the schedule to the site of the leaf and
2436 * print code for reading into shared memory, performing the actual
2437 * computations and writing from shared memory, with the required
2438 * synchronizations.
2440 static void print_kernel_user(struct gpucode_info *code,
2441 struct clast_user_stmt *u)
2443 struct cuda_gen *gen = code->user;
2444 isl_set *shared_domain;
2446 shared_domain = extract_entire_host_domain(u);
2448 print_shared_accesses(gen, shared_domain, gen->read, "read", -1);
2450 print_private_accesses(gen, shared_domain, gen->read, "read", -1);
2452 print_shared_body(gen, shared_domain, gen->local_sched,
2453 gen->thread_tiled_len, &print_statement,
2454 gen->first_unroll);
2456 print_private_accesses(gen, shared_domain, gen->write, "write", -1);
2458 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
2459 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
2461 print_shared_accesses(gen, shared_domain, gen->write, "write", -1);
2463 isl_set_free(shared_domain);
2466 /* Check if we need to perform any copying to shared memory at this level
2467 * and if so, print the copying instructions.
2468 * Any array for which we are allowed to print copying instructions at
2469 * this level, but haven't done so already, is printed.
2471 static void print_kernel_for_head(struct gpucode_info *code,
2472 struct clast_for *f)
2474 int i;
2475 struct cuda_gen *gen = code->user;
2476 isl_set *domain;
2477 int level;
2478 int print = 0;
2480 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2481 level = isl_set_dim(domain, isl_dim_set) - 1;
2483 for (i = 0; i < gen->n_array; ++i) {
2484 if (gen->array[i].print_shared_level >= 0)
2485 continue;
2486 if (gen->array[i].last_shared > level)
2487 continue;
2488 gen->array[i].print_shared_level = level;
2489 print = 1;
2492 if (print) {
2493 print_shared_accesses(gen, domain, gen->read, "read", level);
2494 print_private_accesses(gen, domain, gen->read, "read", level);
2497 isl_set_free(domain);
2500 /* Print instructions for copying from shared memory for each array
2501 * for which print_kernel_for_head has added copying instructions
2502 * to shared memory.
2504 static void print_kernel_for_foot(struct gpucode_info *code,
2505 struct clast_for *f)
2507 int i;
2508 struct cuda_gen *gen = code->user;
2509 isl_set *domain;
2510 int level;
2511 int print = 0;
2513 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2514 level = isl_set_dim(domain, isl_dim_set) - 1;
2516 for (i = 0; i < gen->n_array; ++i) {
2517 if (gen->array[i].print_shared_level != level)
2518 continue;
2519 print = 1;
2520 break;
2523 if (print) {
2524 print_private_accesses(gen, domain, gen->write, "write", level);
2525 print_shared_accesses(gen, domain, gen->write, "write", level);
2528 isl_set_free(domain);
2531 /* Use CLooG to generate code for the outer gen->shared_len loops
2532 * of the local schedule "sched".
2533 * The pretty printing of this code is handled by gpu_print_host_stmt,
2534 * which calls print_kernel_user for each iteration of the shared tile loops.
2536 static void print_cloog_kernel_body(struct cuda_gen *gen,
2537 __isl_keep isl_set *context, __isl_keep isl_union_map *sched)
2539 int i;
2540 CloogOptions *options;
2541 CloogDomain *cloog_context;
2542 CloogUnionDomain *ud;
2543 CloogInput *input;
2544 struct clast_stmt *stmt;
2545 char name[20];
2547 sched = isl_union_map_copy(sched);
2548 sched = isl_union_map_align_params(sched, isl_set_get_space(context));
2550 options = cloog_options_malloc(gen->state);
2551 options->language = CLOOG_LANGUAGE_C;
2552 options->strides = 1;
2553 options->sh = 1;
2554 options->stop = gen->shared_len;
2555 options->f = gen->tiled_len;
2556 options->l = gen->tiled_len;
2557 options->save_domains = 1;
2558 options->noscalars = 1;
2560 ud = cloog_union_domain_from_isl_union_map(sched);
2561 for (i = 0; i < gen->shared_len; ++i) {
2562 snprintf(name, sizeof(name), "g%d", i);
2563 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
2565 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
2566 input = cloog_input_alloc(cloog_context, ud);
2568 stmt = cloog_clast_create_from_input(input, options);
2570 gen->kernel_code.indent = 4;
2571 gen->kernel_code.dst = gen->cuda.kernel_c;
2572 gen->kernel_code.print_user_stmt = NULL;
2573 gen->kernel_code.print_user_stmt_list = &print_kernel_user;
2574 gen->kernel_code.print_for_head = &print_kernel_for_head;
2575 gen->kernel_code.print_for_foot = &print_kernel_for_foot;
2576 gen->kernel_code.user = gen;
2577 gpu_print_host_stmt(&gen->kernel_code, stmt);
2579 cloog_clast_free(stmt);
2580 cloog_options_free(options);
2583 static void print_kernel_iterators(struct cuda_gen *gen)
2585 int i;
2586 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
2587 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
2588 "threadIdx.z" };
2590 if (gen->n_grid > 0) {
2591 print_indent(gen->cuda.kernel_c, 4);
2592 fprintf(gen->cuda.kernel_c, "int ");
2593 for (i = 0; i < gen->n_grid; ++i) {
2594 if (i)
2595 fprintf(gen->cuda.kernel_c, ", ");
2596 fprintf(gen->cuda.kernel_c, "b%d = %s",
2597 i, block_dims[gen->n_grid - 1 - i]);
2599 fprintf(gen->cuda.kernel_c, ";\n");
2602 if (gen->n_block > 0) {
2603 print_indent(gen->cuda.kernel_c, 4);
2604 fprintf(gen->cuda.kernel_c, "int ");
2605 for (i = 0; i < gen->n_block; ++i) {
2606 if (i)
2607 fprintf(gen->cuda.kernel_c, ", ");
2608 fprintf(gen->cuda.kernel_c, "t%d = %s",
2609 i, thread_dims[gen->n_block - 1 - i]);
2611 fprintf(gen->cuda.kernel_c, ";\n");
2615 static void print_group_shared_array(struct cuda_gen *gen,
2616 struct cuda_array_ref_group *group)
2618 int j;
2619 struct cuda_array_bound *bounds;
2621 bounds = group->private_bound;
2622 if (!bounds)
2623 bounds = group->shared_bound;
2624 if (!bounds)
2625 return;
2627 print_indent(gen->cuda.kernel_c, 4);
2628 fprintf(gen->cuda.kernel_c, "%s%s ",
2629 group->private_bound ? "" : "__shared__ ", group->array->type);
2630 print_array_name(gen->cuda.kernel_c, group);
2631 for (j = 0; j < group->array->n_index; ++j) {
2632 fprintf(gen->cuda.kernel_c, "[");
2633 isl_int_print(gen->cuda.kernel_c, bounds[j].size, 0);
2634 fprintf(gen->cuda.kernel_c, "]");
2636 fprintf(gen->cuda.kernel_c, ";\n");
2639 static void print_shared_arrays(struct cuda_gen *gen)
2641 int i, j;
2643 for (i = 0; i < gen->n_array; ++i) {
2644 struct cuda_array_info *array = &gen->array[i];
2646 for (j = 0; j < array->n_group; ++j)
2647 print_group_shared_array(gen, array->groups[j]);
2651 static void print_kernel_body(struct cuda_gen *gen,
2652 __isl_keep isl_set *host_domain, __isl_keep isl_union_map *sched)
2654 isl_set *context;
2656 context = isl_set_copy(host_domain);
2657 context = parametrize(context, 0, gen->tile_first, "h");
2658 context = isl_set_project_out(context, isl_dim_set, 0, gen->tile_first);
2659 context = add_bounded_parameters(context,
2660 gen->n_grid, gen->grid_dim, "b");
2662 print_kernel_iterators(gen);
2663 print_shared_arrays(gen);
2665 fprintf(gen->cuda.kernel_c, "\n");
2667 print_cloog_kernel_body(gen, context, sched);
2669 isl_set_free(context);
2672 /* Given a constraint
2674 * a(p,i) + j = g f(e)
2676 * or -a(p,i) - j = g f(e) if sign < 0,
2677 * store a(p,i) in bound->shift and g (stride) in bound->stride.
2678 * a(p,i) is assumed to be an expression in only the parameters.
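/* For example, the constraint
 *
 *	n + j = 4 alpha
 *
 * with alpha existentially quantified yields shift n and stride 4,
 * expressing that n + j is always a multiple of 4.  */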
2680 static void extract_stride(__isl_keep isl_constraint *c,
2681 struct cuda_array_bound *bound, isl_int stride, int sign)
2683 int i;
2684 isl_int v;
2685 isl_space *dim;
2686 unsigned nparam;
2687 isl_aff *aff;
2689 isl_int_set(bound->stride, stride);
2691 dim = isl_constraint_get_space(c);
2692 dim = isl_space_params(dim);
2694 nparam = isl_space_dim(dim, isl_dim_param);
2696 isl_int_init(v);
2698 isl_constraint_get_constant(c, &v);
2699 if (sign < 0)
2700 isl_int_neg(v, v);
2701 aff = isl_aff_zero_on_domain(isl_local_space_from_space(dim));
2702 aff = isl_aff_set_constant(aff, v);
2704 for (i = 0; i < nparam; ++i) {
2705 isl_constraint_get_coefficient(c, isl_dim_param, i, &v);
2706 if (isl_int_is_zero(v))
2707 continue;
2708 if (sign < 0)
2709 isl_int_neg(v, v);
2710 aff = isl_aff_add_coefficient(aff, isl_dim_param, i, v);
2713 isl_int_clear(v);
2715 bound->shift = aff;
2718 /* Given an equality constraint of a map with a single output dimension j,
2719 * check if the constraint is of the form
2721 * a(p,i) + j = g f(e)
2723 * with a(p,i) an expression in the parameters and input dimensions
2724 * and f(e) an expression in the existentially quantified variables.
2725 * If so, and if g is larger than any such g from a previously considered
2726 * constraint, then call extract_stride to record the stride information
2727 * in bound.
2729 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
2731 int i;
2732 isl_int v, stride;
2733 unsigned n_div;
2734 struct cuda_array_bound *bound = user;
2736 isl_int_init(v);
2737 isl_int_init(stride);
2739 n_div = isl_constraint_dim(c, isl_dim_div);
2740 isl_constraint_get_coefficient(c, isl_dim_out, 0, &v);
2742 if (n_div && (isl_int_is_one(v) || isl_int_is_negone(v))) {
2743 int s = isl_int_sgn(v);
2744 isl_int_set_si(stride, 0);
2745 for (i = 0; i < n_div; ++i) {
2746 isl_constraint_get_coefficient(c, isl_dim_div, i, &v);
2747 isl_int_gcd(stride, stride, v);
2749 if (!isl_int_is_zero(stride) &&
2750 isl_int_gt(stride, bound->stride))
2751 extract_stride(c, bound, stride, s);
2754 isl_int_clear(stride);
2755 isl_int_clear(v);
2757 isl_constraint_free(c);
2758 return 0;
2761 /* Given constraints on an array index i, check if we can find
2762 * a shift a(p) and a stride g such that
2764 * a(p) + i = 0 mod g
2766 * If so, record the information in bound and apply the mapping
2767 * i -> (i + a(p))/g to the array index in bounds and return
2768 * the new constraints.
2769 * If not, simply return the original constraints.
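/* For example, if the accessed elements satisfy n + i = 0 mod 2, then
 * the stride is 2 and the shift is n, and applying
 *
 *	i -> (i + n)/2
 *
 * compresses the range of the index that still needs to be tiled by a
 * factor of 2.  */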
2771 static __isl_give isl_basic_map *check_stride(struct cuda_gen *gen,
2772 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2774 isl_basic_map *aff;
2775 isl_basic_map *shift;
2776 isl_aff *aff_shift;
2778 isl_int_set_si(bound->stride, -1);
2780 aff = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
2782 isl_basic_map_foreach_constraint(aff, &check_stride_constraint, bound);
2784 isl_basic_map_free(aff);
2786 if (isl_int_is_neg(bound->stride))
2787 return bounds;
2789 aff_shift = isl_aff_copy(bound->shift);
2790 aff_shift = isl_aff_add_dims(aff_shift, isl_dim_in, 1);
2791 aff_shift = isl_aff_add_coefficient_si(aff_shift, isl_dim_in, 0, 1);
2792 aff_shift = isl_aff_scale_down(aff_shift, bound->stride);
2793 shift = isl_basic_map_from_aff(aff_shift);
2795 bound->shift_map = isl_basic_map_copy(shift);
2796 bounds = isl_basic_map_apply_range(bounds, shift);
2798 return bounds;
2801 struct cuda_size_info {
2802 isl_basic_set *bset;
2803 struct cuda_array_bound *bound;
2804 int pos;
2807 /* Given a constraint from the basic set describing the bounds on
2808 * an array index, check if it is a lower bound, say m i >= b(x), and,
2809 * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
2810 * upper bound. If so, and if this bound is smaller than any bound
2811 * derived from earlier constraints, set the size to this bound on
2812 * the expression and the lower bound to ceil(b(x)/m).
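/* For example, the constraints 32 t <= i <= 32 t + 31 contain the lower
 * bound i >= 32 t, for which i - 32 t + 1 has constant upper bound 32,
 * so the size becomes 32 and the lower bound 32 t.  */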
2814 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
2816 struct cuda_size_info *size = user;
2817 unsigned nparam;
2818 unsigned n_div;
2819 isl_int v;
2821 nparam = isl_basic_set_dim(size->bset, isl_dim_param);
2822 n_div = isl_constraint_dim(c, isl_dim_div);
2824 if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div)) {
2825 isl_constraint_free(c);
2826 return 0;
2829 isl_int_init(v);
2831 isl_constraint_get_coefficient(c, isl_dim_set, size->pos, &v);
2833 if (isl_int_is_pos(v)) {
2834 isl_aff *aff;
2835 isl_aff *lb;
2836 enum isl_lp_result res;
2838 aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
2839 aff = isl_aff_ceil(aff);
2841 lb = isl_aff_copy(aff);
2843 aff = isl_aff_neg(aff);
2844 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
2846 res = isl_basic_set_max(size->bset, aff, &v);
2847 isl_aff_free(aff);
2849 if (res == isl_lp_ok) {
2850 isl_int_add_ui(v, v, 1);
2851 if (isl_int_is_neg(size->bound->size) ||
2852 isl_int_lt(v, size->bound->size)) {
2853 isl_int_set(size->bound->size, v);
2854 lb = isl_aff_drop_dims(lb, isl_dim_in,
2855 0, size->pos + 1);
2856 isl_aff_free(size->bound->lb);
2857 size->bound->lb = isl_aff_copy(lb);
2860 isl_aff_free(lb);
2863 isl_int_clear(v);
2864 isl_constraint_free(c);
2866 return 0;
2869 /* Given a basic map "bounds" that maps parameters and input dimensions
2870 * to a single output dimension, look for an expression in the parameters
2871 * and input dimensions such that the range of the output dimension shifted
2872 * by this expression is a constant.
2874 * In particular, we currently only consider lower bounds on the output
2875 * dimension as candidate expressions.
2877 static int compute_array_dim_size(struct cuda_gen *gen,
2878 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2880 struct cuda_size_info size;
2882 bounds = isl_basic_map_detect_equalities(bounds);
2883 bounds = check_stride(gen, bound, bounds);
2885 isl_int_set_si(bound->size, -1);
2886 bound->lb = NULL;
2888 size.bound = bound;
2889 size.pos = isl_basic_map_dim(bounds, isl_dim_in);
2890 size.bset = isl_basic_map_wrap(bounds);
2891 size.bset = isl_basic_set_flatten(size.bset);
2892 size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
2893 isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
2894 &size);
2895 isl_basic_set_free(size.bset);
2897 return isl_int_is_nonneg(bound->size) ? 0 : -1;
2900 /* Check if we can find a shared memory tile for the given array
2901 * based on the given accesses, and if so, put the results
2902 * in array->shared_bound.
2904 * We project the accesses on each index in turn and look for a parametric
2905 * offset such that the size is constant.
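/* For example, if in some dimension the accessed elements are
 *
 *	{ A[a] : t <= a <= t + 31 }
 *
 * for a parametric expression t, then that dimension admits offset t
 * and constant size 32.  If the extent itself depends on a parameter,
 * no constant size exists and the tiling fails.  */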
2907 static int can_tile_for_shared_memory(struct cuda_gen *gen,
2908 struct cuda_array_info *array, __isl_keep isl_map *access,
2909 struct cuda_array_bound *bounds)
2911 int i;
2913 for (i = 0; i < array->n_index; ++i) {
2914 isl_map *access_i;
2915 isl_basic_map *hull;
2917 access_i = isl_map_copy(access);
2918 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
2919 access_i = isl_map_project_out(access_i, isl_dim_out,
2920 1, array->n_index - (i + 1));
2921 access_i = isl_map_compute_divs(access_i);
2922 hull = isl_map_simple_hull(access_i);
2923 if (compute_array_dim_size(gen, &bounds[i], hull) < 0)
2924 return 0;
2927 return 1;
2930 /* Construct a map with input the shared tile loops and the loops that
2931 * will be wrapped around the threads, relating these latter loops
2932 * to the thread indices and then projecting them out.
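/* Roughly speaking, with a single block dimension of size 128 and
 * wrapping enabled, the resulting map relates the last input dimension a
 * to the thread identity through a constraint of the form
 *
 *	a mod 128 = t0
 *
 * and then drops a, keeping only the shared tile loops in the range.  */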
2934 static __isl_give isl_map *compute_privatization(struct cuda_gen *gen)
2936 isl_map *priv;
2937 isl_map *tiling;
2938 isl_map *proj;
2939 isl_set *par;
2940 isl_space *dim;
2942 dim = isl_union_map_get_space(gen->shared_sched);
2944 if (gen->options->wrap)
2945 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
2946 gen->shared_len, gen->n_block, gen->block_dim);
2947 else
2948 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
2949 gen->shared_len, gen->n_block, gen->block_dim);
2951 priv = tiling;
2953 par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2954 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2955 gen->n_block, "t");
2957 priv = isl_map_align_params(priv, isl_set_get_space(par));
2958 priv = isl_map_intersect_range(priv, par);
2960 dim = isl_map_get_space(priv);
2961 dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2962 dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2963 proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2964 gen->shared_len);
2966 priv = isl_map_apply_range(priv, proj);
2968 return priv;
2971 /* Construct a map from domain_dim to domain_dim that increments
2972 * the dimension at position "pos" and leaves all other dimensions
2973 * constant.
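/* For example, next(dim, 1) on a three-dimensional space yields
 *
 *	{ [i0, i1, i2] -> [i0, i1 + 1, i2] }  */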
2975 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2977 int i;
2978 int len = isl_space_dim(domain_dim, isl_dim_set);
2979 isl_space *dim;
2980 isl_basic_map *next;
2981 isl_local_space *ls;
2983 dim = isl_space_map_from_set(domain_dim);
2984 next = isl_basic_map_universe(isl_space_copy(dim));
2985 ls = isl_local_space_from_space(dim);
2987 for (i = 0; i < len; ++i) {
2988 isl_constraint *c;
2990 c = isl_equality_alloc(isl_local_space_copy(ls));
2991 isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2992 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2993 if (i == pos)
2994 isl_constraint_set_constant_si(c, 1);
2995 next = isl_basic_map_add_constraint(next, c);
2998 isl_local_space_free(ls);
3000 return isl_map_from_basic_map(next);
3003 /* Check if the given access is coalesced.
3004 * That is, check whether incrementing the dimension that will get
3005 * wrapped over the last thread index results in incrementing
3006 * the last array index.
3008 * This function is only called for access relations without reuse.
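/* For example, an access A[j][t], with t the iterator that gets wrapped
 * over threadIdx.x, is coalesced because incrementing t increments the
 * innermost array index, while A[t][j] is not, since incrementing t then
 * jumps by an entire row.  */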
3010 static int access_is_coalesced(struct cuda_gen *gen,
3011 __isl_keep isl_union_map *access)
3013 isl_space *dim;
3014 isl_map *access_map;
3015 isl_map *next_thread_x;
3016 isl_map *next_element;
3017 isl_map *map;
3018 int coalesced;
3020 access = isl_union_map_copy(access);
3021 access = isl_union_map_apply_domain(access,
3022 isl_union_map_copy(gen->tiled_sched));
3023 access_map = isl_map_from_union_map(access);
3025 dim = isl_map_get_space(access_map);
3026 dim = isl_space_domain(dim);
3027 next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
3029 dim = isl_map_get_space(access_map);
3030 dim = isl_space_range(dim);
3031 next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
3033 map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
3034 map = isl_map_apply_range(map, access_map);
3036 coalesced = isl_map_is_subset(map, next_element);
3038 isl_map_free(next_element);
3039 isl_map_free(map);
3041 return coalesced;
3044 /* For the given array reference group, check whether the access is private
3045 * to the thread. That is, check that any given array element
3046 * is only accessed by a single thread.
3047 * We compute an access relation that maps the shared tile loop iterators
3048 * and the shared point loop iterators that will be wrapped over the
3049 * threads to the array elements.
3050 * We actually check that those iterators that will be wrapped
3051 * partition the array space. This check is stricter than necessary
3052 * since several iterations may be mapped onto the same thread
3053 * and then they could be allowed to access the same memory elements,
3054 * but our check does not allow this situation.
3056 * We also check that the index expression only depends on parallel
3057 * loops. That way, we can move those loops innermost and unroll them.
3058 * Again, we use a test that is stricter than necessary.
3059 * We actually check whether the index expression only depends
3060 * on the iterators that are wrapped over the threads.
3061 * These are necessarily parallel, but there may be more parallel loops.
3063 * Combining the injectivity of the first test with the single-valuedness
3064 * of the second test, we simply test for bijectivity.
3066 * If it turns out we can use registers, we compute the private memory
3067 * tile size using can_tile_for_shared_memory, after introducing a dependence
3068 * on the thread indices.
3070 * Before performing any of the above computations, we first check
3071 * if there is any reuse on the reference group. If not, we simply
3072 * return. If, moreover, the access is coalesced then we also remove
3073 * the shared memory tiling since we should just use global memory instead.
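/* For example, an access A[t], with t an iterator that gets wrapped over
 * the threads, is bijective after wrapping, so each element is touched by
 * a single thread and the group can be placed in registers.  An access
 * A[i], with i a shared tile iterator common to all threads, is not, and
 * the group stays in shared or global memory.  */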
3075 static void check_private_group_access(struct cuda_gen *gen,
3076 struct cuda_array_ref_group *group)
3078 isl_map *acc;
3079 isl_union_map *access;
3080 int n_index = group->array->n_index;
3082 access = group_access_relation(group, 1, 1);
3083 if (isl_union_map_is_injective(access)) {
3084 if (group->shared_bound && access_is_coalesced(gen, access)) {
3085 free_bound_list(group->shared_bound, n_index);
3086 group->shared_bound = NULL;
3088 isl_union_map_free(access);
3089 return;
3091 access = isl_union_map_apply_domain(access,
3092 isl_union_map_copy(gen->shared_sched));
3094 acc = isl_map_from_union_map(access);
3096 if (!isl_map_is_bijective(acc)) {
3097 isl_map_free(acc);
3098 return;
3101 group->private_bound = create_bound_list(gen->ctx, n_index);
3102 acc = isl_map_align_params(acc, isl_map_get_space(gen->privatization));
3103 acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
3104 if (!can_tile_for_shared_memory(gen, group->array, acc,
3105 group->private_bound)) {
3106 free_bound_list(group->private_bound, n_index);
3107 group->private_bound = NULL;
3110 isl_map_free(acc);
3113 /* Look for the last shared tile loop that affects the offset of the
3114 * shared or private tile and store the result in array->last_shared.
3116 static void set_last_shared(struct cuda_gen *gen,
3117 struct cuda_array_ref_group *group)
3119 int i, j;
3120 struct cuda_array_bound *bounds;
3121 unsigned first_shared = gen->first_shared;
3122 int n_index = group->array->n_index;
3124 bounds = group->private_bound;
3125 if (!bounds)
3126 bounds = group->shared_bound;
3127 if (!bounds)
3128 return;
3130 for (j = gen->shared_len - 1; j >= 0; --j) {
3131 for (i = 0; i < n_index; ++i) {
3132 isl_aff *lb;
3133 isl_aff *shift;
3135 lb = bounds[i].lb;
3136 if (isl_aff_involves_dims(lb, isl_dim_param,
3137 first_shared + j, 1))
3138 break;
3140 shift = bounds[i].shift;
3141 if (!shift)
3142 continue;
3143 if (isl_aff_involves_dims(shift, isl_dim_param,
3144 first_shared + j, 1))
3145 break;
3147 if (i < n_index)
3148 break;
3150 group->array->last_shared = j;
3153 /* Compute the sizes of all private arrays for the current kernel,
3154 * as well as the offsets of the private pieces in the original arrays.
3155 * If we cannot or don't want to privatize a given array group,
3156 * we use the shared memory tile sizes computed in
3157 * compute_group_shared_bound instead.
3159 * If a given array only has a single reference group and if we have
3160 * been able to find a private or shared tile,
3161 * we also look for the last shared tile loop that affects the offset
3162 * (and therefore the array tile) and store the result in array->last_shared.
3164 * A privatized copy of all access relations from reference groups that
3165 * are mapped to private memory is stored in gen->private_access.
3167 static void compute_private_size(struct cuda_gen *gen)
3169 int i, j;
3170 isl_union_map *private;
3172 if (!gen->options->use_private_memory)
3173 return;
3175 private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
3177 for (i = 0; i < gen->n_array; ++i) {
3178 struct cuda_array_info *array = &gen->array[i];
3180 for (j = 0; j < array->n_group; ++j) {
3181 check_private_group_access(gen, array->groups[j]);
3183 if (!array->groups[j]->private_bound)
3184 continue;
3186 private = isl_union_map_union(private,
3187 group_access_relation(array->groups[j], 1, 1));
3190 array->last_shared = gen->shared_len - 1;
3191 array->print_shared_level = -1;
3193 if (array->n_group != 1)
3194 continue;
3195 set_last_shared(gen, array->groups[0]);
3198 if (isl_union_map_is_empty(private))
3199 isl_union_map_free(private);
3200 else {
3201 isl_union_map *priv;
3203 private = isl_union_map_apply_domain(private,
3204 isl_union_map_copy(gen->shared_sched));
3205 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
3206 private = isl_union_map_apply_domain(private, priv);
3207 gen->private_access = private;
3211 /* Fill up the groups array with singleton groups, i.e., one group
3212 * per reference, initializing the array, access, write and refs fields.
3213 * In particular the access field is initialized to the scheduled
3214 * access relation of the array reference.
3216 * Return the number of elements initialized, i.e., the number of
3217 * active references in the current kernel.
3219 static int populate_array_references(struct cuda_gen *gen,
3220 struct cuda_array_info *array, __isl_keep isl_union_map *sched,
3221 struct cuda_array_ref_group **groups)
3223 int i;
3224 int n;
3225 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3227 n = 0;
3228 for (i = 0; i < array->n_ref; ++i) {
3229 isl_union_map *umap;
3230 isl_map *map;
3231 struct cuda_array_ref_group *group;
3232 struct cuda_stmt_access *access = array->refs[i];
3234 map = isl_map_copy(access->access);
3235 umap = isl_union_map_from_map(map);
3236 umap = isl_union_map_apply_domain(umap,
3237 isl_union_map_copy(sched));
3239 if (isl_union_map_is_empty(umap)) {
3240 isl_union_map_free(umap);
3241 continue;
3244 map = isl_map_from_union_map(umap);
3246 group = isl_calloc_type(ctx, struct cuda_array_ref_group);
3247 assert(group);
3248 group->array = array;
3249 group->access = map;
3250 group->write = access->write;
3251 group->refs = &array->refs[i];
3253 groups[n++] = group;
3256 return n;
3259 static void free_array_ref_group(struct cuda_array_ref_group *group,
3260 int n_index)
3262 if (!group)
3263 return;
3264 free_bound_list(group->shared_bound, n_index);
3265 free_bound_list(group->private_bound, n_index);
3266 isl_map_free(group->access);
3267 free(group->refs);
3268 free(group);
3271 /* If two groups have overlapping access relations and if one of them
3272 * involves a write, then merge the two groups into one.
3274 * We keep track of the grouping in "leader". leader[j] points to
3275 * an earlier group array element that belongs to the same group,
3276 * or the array element j itself if this element is the first in the group.
3278 * Return the number of group leaders.
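/* For example, with four singleton groups of which only groups 0 and 2
 * overlap and group 2 is a write, the result is leader = { 0, 1, 0, 3 }:
 * the accesses of group 2 are merged into group 0 and three group
 * leaders remain.  */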
3280 static int group_overlapping_writes(int n,
3281 struct cuda_array_ref_group **groups, int *leader)
3283 int i, j;
3284 int n_group = n;
3286 for (i = 0; i < n; ++i) {
3287 int l = i;
3288 groups[l]->n_ref = 1;
3289 for (j = i - 1; j >= 0; --j) {
3290 isl_map *map;
3291 int empty;
3293 if (leader[j] != j)
3294 continue;
3295 if (!groups[l]->write && !groups[j]->write)
3296 continue;
3298 map = isl_map_intersect(isl_map_copy(groups[l]->access),
3299 isl_map_copy(groups[j]->access));
3300 empty = isl_map_is_empty(map);
3301 isl_map_free(map);
3303 if (empty)
3304 continue;
3306 groups[j]->access = isl_map_union(groups[j]->access,
3307 groups[l]->access);
3308 groups[j]->write = 1;
3309 groups[l]->access = NULL;
3310 groups[j]->n_ref += groups[l]->n_ref;
3311 l = leader[l] = j;
3312 n_group--;
3314 leader[i] = l;
3317 return n_group;
3320 /* Compute the size of the shared array corresponding to the given array
3321 * reference group, based on the accesses from the current kernel,
3322 * as well as the offset of the shared piece in the original array.
3324 static void compute_group_shared_bound(struct cuda_gen *gen,
3325 struct cuda_array_info *array, struct cuda_array_ref_group *group)
3327 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3329 if (!gen->options->use_shared_memory)
3330 return;
3331 if (cuda_array_is_read_only_scalar(array))
3332 return;
3334 group->shared_bound = create_bound_list(ctx, array->n_index);
3335 if (!can_tile_for_shared_memory(gen, array, group->access,
3336 group->shared_bound)) {
3337 free_bound_list(group->shared_bound, array->n_index);
3338 group->shared_bound = NULL;
3342 /* Given an initial grouping of array references and shared memory tiles
3343 * for each group that allows for a shared memory tile, merge two groups
3344 * if both have a shared memory tile and if the merged group also has
3345 * a shared memory tile.
3347 * Return the number of group leaders after merging.
3349 static int group_common_shared_memory_tile(struct cuda_gen *gen,
3350 struct cuda_array_info *array, int n,
3351 struct cuda_array_ref_group **groups, int *leader, int n_group)
3353 int i, j;
3354 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3356 for (i = 0; n_group > 1 && i < n; ++i) {
3357 int l = i;
3358 if (leader[i] != i)
3359 continue;
3360 if (!groups[i]->shared_bound)
3361 continue;
3362 for (j = i - 1; j >= 0; --j) {
3363 isl_map *map;
3364 int empty;
3365 struct cuda_array_bound *shared_bound;
3367 if (leader[j] != j)
3368 continue;
3369 if (!groups[j]->shared_bound)
3370 continue;
3372 map = isl_map_intersect(isl_map_copy(groups[l]->access),
3373 isl_map_copy(groups[j]->access));
3374 empty = isl_map_is_empty(map);
3375 isl_map_free(map);
3377 if (empty)
3378 continue;
3380 map = isl_map_union(isl_map_copy(groups[l]->access),
3381 isl_map_copy(groups[j]->access));
3382 shared_bound = create_bound_list(ctx, array->n_index);
3383 if (!can_tile_for_shared_memory(gen, array, map,
3384 shared_bound)) {
3385 isl_map_free(map);
3386 free_bound_list(shared_bound, array->n_index);
3387 continue;
3390 free_bound_list(groups[j]->shared_bound,
3391 array->n_index);
3392 groups[j]->shared_bound = shared_bound;
3393 isl_map_free(groups[j]->access);
3394 groups[j]->access = map;
3395 groups[j]->n_ref += groups[l]->n_ref;
3396 l = leader[l] = j;
3397 n_group--;
3401 return n_group;
3404 /* Extract an array of array reference groups from the array of references
3405 * and the grouping information in "leader".
3407 * Store the results in array->n_group and array->groups.
3409 static void extract_array_groups(isl_ctx *ctx, struct cuda_array_info *array,
3410 int n, struct cuda_array_ref_group **groups, int *leader, int n_group)
3412 int i, j;
3414 for (i = 2; i < n; ++i)
3415 leader[i] = leader[leader[i]];
3417 array->n_group = n_group;
3418 array->groups = isl_alloc_array(ctx, struct cuda_array_ref_group *,
3419 n_group);
3420 assert(array->groups);
3422 j = 0;
3423 for (i = 0; i < n; ++i) {
3424 int k, l;
3425 struct cuda_stmt_access **refs;
3427 if (leader[i] != i) {
3428 groups[i]->refs = NULL;
3429 free_array_ref_group(groups[i], array->n_index);
3430 continue;
3433 refs = isl_alloc_array(ctx, struct cuda_stmt_access *,
3434 groups[i]->n_ref);
3435 assert(refs);
3436 l = 0;
3437 for (k = i; k < n; ++k)
3438 if (leader[k] == i) {
3439 refs[l++] = *groups[k]->refs;
3440 (*groups[k]->refs)->group = j;
3443 groups[i]->refs = refs;
3444 groups[i]->nr = j;
3445 array->groups[j++] = groups[i];
3449 /* Group array references that should be considered together when
3450 * deciding whether to access them from private, shared or global memory.
3452 * In particular, if two array references overlap and if one of them
3453 * is a write, then the two references are grouped together.
3454 * Furthermore, if two groups admit a shared memory tile and if the
3455 * combination of the two also admits a shared memory tile, we merge
3456 * the two groups.
3458 * During the construction the group->refs field points to a single
3459 * array reference inside the array of array references, while
3460 * group->n_ref contains the number of elements in leader that
3461 * (directly or indirectly) point to this group, provided the group
3462 * is a leader.
3464 static void group_array_references(struct cuda_gen *gen,
3465 struct cuda_array_info *array, __isl_keep isl_union_map *sched)
3467 int i;
3468 int n, n_group;
3469 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3470 struct cuda_array_ref_group **groups;
3471 int *leader;
3473 groups = isl_calloc_array(ctx, struct cuda_array_ref_group *,
3474 array->n_ref);
3475 assert(groups);
3477 n = populate_array_references(gen, array, sched, groups);
3479 leader = isl_alloc_array(ctx, int, n);
3480 assert(leader);
3482 n_group = group_overlapping_writes(n, groups, leader);
3484 for (i = 0; i < n; ++i)
3485 if (leader[i] == i)
3486 compute_group_shared_bound(gen, array, groups[i]);
3488 n_group = group_common_shared_memory_tile(gen, array, n, groups,
3489 leader, n_group);
3491 extract_array_groups(ctx, array, n, groups, leader, n_group);
3493 free(leader);
3494 free(groups);
3497 /* Take tiled_sched, project it onto the shared tile loops and
3498 * the loops that will be wrapped over the threads,
3499 * parametrize the shared tile loops and store the result in gen->shared_sched.
3500 * The position of the first of these parameters is stored in gen->first_shared.
3501 * Also compute a projection that projects out the loops that will be
3502 * wrapped over the threads and store this projection in gen->shared_proj.
3504 static void compute_shared_sched(struct cuda_gen *gen)
3506 isl_space *dim;
3507 isl_map *proj;
3508 isl_set *par;
3509 isl_union_map *sched;
3511 sched = isl_union_map_copy(gen->tiled_sched);
3513 dim = isl_union_map_get_space(sched);
3514 gen->first_shared = isl_space_dim(dim, isl_dim_param);
3515 proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
3516 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3518 dim = isl_union_map_get_space(sched);
3519 par = parametrization(dim, gen->shared_len + gen->n_block,
3520 0, gen->shared_len, "g");
3521 sched = isl_union_map_intersect_range(sched,
3522 isl_union_set_from_set(par));
3524 dim = isl_union_map_get_space(sched);
3525 proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
3527 gen->shared_sched = sched;
3528 gen->shared_proj = isl_union_map_from_map(proj);
3531 /* Group references of all arrays in the program.
3533 static void group_references(struct cuda_gen *gen)
3535 int i;
3536 isl_union_map *sched;
3538 sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3539 isl_union_map_copy(gen->shared_proj));
3541 for (i = 0; i < gen->n_array; ++i)
3542 group_array_references(gen, &gen->array[i], sched);
3544 isl_union_map_free(sched);
3547 /* Free all array information that is local to the current kernel.
3549 static void free_local_array_info(struct cuda_gen *gen)
3551 int i, j;
3553 for (i = 0; i < gen->n_array; ++i) {
3554 struct cuda_array_info *array = &gen->array[i];
3556 for (j = 0; j < array->n_group; ++j)
3557 free_array_ref_group(array->groups[j], array->n_index);
3558 free(array->groups);
3560 if (array->n_group == 0)
3561 continue;
3562 for (j = 0; j < gen->array[i].n_index; ++j) {
3563 isl_pw_aff_free(gen->array[i].local_bound[j]);
3564 gen->array[i].local_bound[j] = NULL;
3569 static void print_iterator_list(FILE *out, int len, const char *prefix,
3570 int parens)
3572 int i;
3574 fprintf(out, "(");
3575 for (i = 0; i < len; ++i) {
3576 if (i)
3577 fprintf(out, ", ");
3578 if (parens)
3579 fprintf(out, "(%s%d)", prefix, i);
3580 else
3581 fprintf(out, "%s%d", prefix, i);
3583 fprintf(out, ")");
3586 /* The sizes of the arrays on the host that have been computed by
3587 * extract_array_info may depend on the parameters. Use the extra
3588 * constraints on the parameters that are valid at "host_domain"
3589 * to simplify these expressions.
3591 static void localize_bounds(struct cuda_gen *gen,
3592 __isl_keep isl_set *host_domain)
3594 int i, j;
3595 isl_set *context;
3597 context = isl_set_copy(host_domain);
3598 context = isl_set_params(context);
3600 for (i = 0; i < gen->n_array; ++i) {
3601 struct cuda_array_info *array = &gen->array[i];
3603 if (array->n_group == 0)
3604 continue;
3606 for (j = 0; j < array->n_index; ++j) {
3607 isl_pw_aff *pwaff;
3609 pwaff = isl_pw_aff_copy(array->bound[j]);
3610 pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3611 array->local_bound[j] = pwaff;
3614 isl_set_free(context);
3617 /* Set gen->tile_len and gen->n_parallel to those of the first statement
3618 * in the statement list u.
3619 * Because of the way the schedule is constructed, the other statements
3620 * in the list, if any, should have the same values for these properties.
3622 static void set_tile_len(struct cuda_gen *gen, struct clast_user_stmt *u)
3624 int nr;
3625 struct cuda_stmt *stmt;
3627 nr = atoi(u->statement->name + 2);
3628 stmt = &gen->stmts[nr];
3630 gen->tile_len = stmt->tile_len;
3631 gen->n_parallel = stmt->n_parallel;
3634 /* This function is called for each leaf in the clast of the host code.
3635 * We first specialize the schedule to the site of the leaf, compute
3636 * the size of shared memory and then print the body of host code
3637 * and the associated kernel (through a call to print_kernel_body).
3639 static void print_host_user(struct gpucode_info *code,
3640 struct clast_user_stmt *u)
3642 struct cuda_gen *gen = code->user;
3643 isl_space *dim;
3644 isl_set *par;
3645 isl_set *host_domain;
3646 isl_union_map *access;
3647 isl_union_map *local_sched;
3648 isl_union_set *arrays;
3650 set_tile_len(gen, u);
3651 read_sizes(gen);
3653 host_domain = extract_entire_host_domain(u);
3655 local_sched = isl_union_map_intersect_range(
3656 isl_union_map_copy(gen->sched),
3657 isl_union_set_from_set(extend(isl_set_copy(host_domain),
3658 gen->untiled_len)));
3659 access = isl_union_map_union(isl_union_map_copy(gen->read),
3660 isl_union_map_copy(gen->write));
3661 access = isl_union_map_apply_domain(access,
3662 isl_union_map_copy(local_sched));
3663 arrays = isl_union_map_range(access);
3665 print_indent(code->dst, code->indent);
3666 fprintf(code->dst, "dim3 k%d_dimBlock", gen->kernel_id);
3667 print_reverse_list(code->dst, gen->n_block, gen->block_dim);
3668 fprintf(code->dst, ";\n");
3670 print_indent(code->dst, code->indent);
3671 fprintf(code->dst, "dim3 k%d_dimGrid", gen->kernel_id);
3672 print_reverse_list(code->dst, gen->n_grid, gen->grid_dim);
3673 fprintf(code->dst, ";\n");
3675 gen->tiled_sched = tile_schedule(gen, local_sched);
3676 gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
3677 gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
3679 gen->local_sched = isl_union_map_copy(gen->tiled_sched);
3681 dim = isl_union_map_get_space(gen->local_sched);
3682 par = parametrization(dim, gen->tiled_len, 0, gen->shared_len, "g");
3683 gen->local_sched = isl_union_map_intersect_range(gen->local_sched,
3684 isl_union_set_from_set(par));
3686 gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
3687 gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
3689 gen->private_access = NULL;
3690 compute_shared_sched(gen);
3691 gen->privatization = compute_privatization(gen);
3692 group_references(gen);
3693 compute_private_size(gen);
3694 localize_bounds(gen, host_domain);
3696 gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
3698 print_kernel_launch(gen, arrays);
3700 fprintf(gen->cuda.kernel_c, "{\n");
3702 print_kernel_body(gen, host_domain, gen->tiled_sched);
3704 fprintf(gen->cuda.kernel_c, "}\n");
3706 free_local_array_info(gen);
3707 isl_map_free(gen->privatization);
3708 isl_union_map_free(gen->private_access);
3709 isl_union_map_free(gen->local_sched);
3710 isl_union_map_free(gen->tiled_sched);
3711 isl_union_map_free(gen->shared_sched);
3712 isl_union_map_free(gen->shared_proj);
3713 isl_union_set_free(arrays);
3714 isl_set_free(host_domain);
3716 free(gen->tile_size);
3717 gen->kernel_id++;
3720 /* Use CLooG to generate code for the outer gen->tile_first loops
3721 * of the global schedule in gen->sched.
3722 * The pretty printing of this code is handled by gpu_print_host_stmt,
3723 * which calls print_host_user for each kernel invocation location.
3725 static void print_cloog_host_code(struct cuda_gen *gen)
3727 int i;
3728 isl_set *context;
3729 isl_union_map *sched;
3730 CloogOptions *options;
3731 CloogDomain *cloog_context;
3732 CloogUnionDomain *ud;
3733 CloogInput *input;
3734 struct clast_stmt *stmt;
3735 char name[20];
3737 options = cloog_options_malloc(gen->state);
3738 options->language = CLOOG_LANGUAGE_C;
3739 options->otl = 0;
3740 options->strides = 1;
3741 options->stop = gen->tile_first;
3742 options->f = gen->untiled_len;
3743 options->l = gen->untiled_len;
3744 options->save_domains = 1;
3745 options->noscalars = 1;
3747 sched = isl_union_map_copy(gen->sched);
3748 ud = cloog_union_domain_from_isl_union_map(sched);
3749 for (i = 0; i < options->stop; ++i) {
3750 snprintf(name, sizeof(name), "h%d", i);
3751 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
3753 context = isl_set_copy(gen->context);
3754 cloog_context = cloog_domain_from_isl_set(context);
3755 input = cloog_input_alloc(cloog_context, ud);
3757 stmt = cloog_clast_create_from_input(input, options);
3759 gen->code.indent = 0;
3760 gen->code.dst = gen->cuda.host_c;
3761 gen->code.print_user_stmt = NULL;
3762 gen->code.print_user_stmt_list = &print_host_user;
3763 gen->code.print_for_head = NULL;
3764 gen->code.print_for_foot = NULL;
3765 gen->code.user = gen;
3766 gpu_print_host_stmt(&gen->code, stmt);
3768 cloog_clast_free(stmt);
3769 cloog_options_free(options);
3770 fprintf(gen->cuda.host_c, "\n");
3773 void print_cuda_macros(struct cuda_gen *gen)
3775 const char *macros =
3776 "#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
3777 "#define cudaCheckKernel()"
3778 " assert(cudaGetLastError() == cudaSuccess)\n\n";
3779 fputs(macros, gen->cuda.host_c);
3782 void print_host_code(struct cuda_gen *gen)
3784 fprintf(gen->cuda.host_c, "{\n");
3785 print_cloog_macros(gen->cuda.host_c);
3786 print_cloog_macros(gen->cuda.kernel_c);
3788 print_cuda_macros(gen);
3790 declare_device_arrays(gen);
3792 allocate_device_arrays(gen);
3793 copy_arrays_to_device(gen);
3795 gen->kernel_id = 0;
3796 print_cloog_host_code(gen);
3798 copy_arrays_from_device(gen);
3799 free_device_arrays(gen);
3801 fprintf(gen->cuda.host_c, "}\n");
3804 __isl_give isl_set *add_context_from_str(__isl_take isl_set *set,
3805 const char *str)
3807 isl_ctx *ctx;
3808 isl_set *context;
3810 if (!str)
3811 return set;
3813 ctx = isl_set_get_ctx(set);
3814 context = isl_set_read_from_str(ctx, str);
3815 context = isl_set_align_params(context, isl_set_get_space(set));
3816 set = isl_set_intersect(set, context);
3818 return set;
3821 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
3823 if (!str)
3824 return NULL;
3825 return isl_union_map_read_from_str(ctx, str);
3828 /* Return the union of the iteration domains of all statements in gen->stmts.
3830 static __isl_give isl_union_set *extract_domain(struct cuda_gen *gen)
3832 int i;
3833 isl_union_set *domain;
3835 domain = isl_union_set_empty(isl_set_get_space(gen->context));
3836 for (i = 0; i < gen->n_stmts; ++i) {
3837 isl_set *domain_i;
3839 domain_i = isl_set_copy(gen->stmts[i].domain);
3840 domain = isl_union_set_union(domain,
3841 isl_union_set_from_set(domain_i));
3844 return domain;
3847 /* Information about the outermost tilable bands in the forest of bands.
3849 * tile_len and n_parallel are only set on band_info structures
3850 * that correspond to outermost bands. For other bands (in particular,
3851 * ancestors of the outermost bands), n_parallel is set to 0.
3853 * prefix is the (padded) schedule leading up to the outermost tilable bands.
3855 * tile_first is the number of schedule dimensions in prefix.
3857 * suffix is the schedule of the outermost tilable bands and their descendants.
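 *
 * For example (a hypothetical band forest, not taken from the source):
 * if the schedule consists of one sequential outer loop followed by a
 * fully parallel two-dimensional tilable band, then for that band
 *
 *	tile_first = 1	(one schedule dimension in prefix)
 *	tile_len   = 2
 *	n_parallel = 2
 *
 * with prefix covering the outer loop and suffix covering the band
 * and its descendants.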
3859 struct band_info {
3860 struct cuda_gen *gen;
3861 int tile_first;
3862 int tile_len;
3863 int n_parallel;
3864 isl_union_map *prefix;
3865 isl_union_map *suffix;
3868 /* Set tile_len and n_parallel of the statement to those of
3869 * its outermost band, as recorded in the band_info.
3871 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
3873 struct band_info *info = user;
3874 int nr;
3875 struct cuda_stmt *stmt;
3877 nr = atoi(isl_map_get_tuple_name(map, isl_dim_in) + 2);
3878 stmt = &info->gen->stmts[nr];
3880 stmt->tile_len = info->tile_len;
3881 stmt->n_parallel = info->n_parallel;
3883 isl_map_free(map);
3885 return 0;
3888 static void list_select_outer_band(struct cuda_gen *gen,
3889 __isl_take isl_band_list *list, int pos, struct band_info *list_info);
3891 /* Check if this band has any parallel loops. If so, take it as
3892 * the outermost tilable band. If not, continue looking for the
3893 * outermost tilable band in the children of the current band.
3895 static void band_select_outer_band(struct cuda_gen *gen,
3896 __isl_take isl_band *band, int pos, struct band_info *info)
3898 int n = isl_band_n_member(band);
3899 int n_parallel;
3901 for (n_parallel = 0; n_parallel < n; ++n_parallel)
3902 if (!isl_band_member_is_zero_distance(band, n_parallel))
3903 break;
3905 info->n_parallel = n_parallel;
3906 if (n_parallel) {
3907 info->gen = gen;
3908 info->tile_first = pos;
3909 info->tile_len = n;
3910 info->prefix = isl_band_get_prefix_schedule(band);
3911 info->suffix = isl_union_map_flat_range_product(
3912 isl_band_get_partial_schedule(band),
3913 isl_band_get_suffix_schedule(band));
3914 isl_union_map_foreach_map(info->prefix,
3915 &set_stmt_tile_len, info);
3916 } else if (isl_band_has_children(band)) {
3917 isl_band_list *children;
3918 children = isl_band_get_children(band);
3919 list_select_outer_band(gen, children, pos + n, info);
3920 } else {
3921 info->gen = gen;
3922 info->tile_first = pos + n;
3923 info->tile_len = 0;
3924 info->prefix = isl_union_map_flat_range_product(
3925 isl_band_get_prefix_schedule(band),
3926 isl_band_get_partial_schedule(band));
3927 info->suffix = isl_band_get_suffix_schedule(band);
3928 isl_union_map_foreach_map(info->prefix,
3929 &set_stmt_tile_len, info);
3932 isl_band_free(band);
3935 /* Comparison function that returns a non-zero value for band_infos
3936 * with different tile_len fields or different n_parallel fields.
3938 static int cmp_band(const void *p1, const void *p2)
3940 const struct band_info *info1 = p1;
3941 const struct band_info *info2 = p2;
3943 if (info1->tile_len != info2->tile_len)
3944 return info1->tile_len - info2->tile_len;
3946 return info1->n_parallel - info2->n_parallel;
3949 /* Extend "umap" with coordinates with fixed value "val"
3950 * to a total length of "dst_len", assuming the original dimension is "src_len".
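 *
 * For example (hypothetical values), with src_len = 1, dst_len = 3 and
 * val = 7, the relation { S[i] -> [i] } would become { S[i] -> [i, 7, 7] }.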
3952 static __isl_give isl_union_map *extend_range(__isl_take isl_union_map *umap,
3953 int src_len, int dst_len, int val)
3955 isl_space *dim;
3956 isl_map *map;
3957 int i;
3959 dim = isl_union_map_get_space(umap);
3960 map = isl_map_reverse(projection(dim, dst_len, src_len));
3961 for (i = src_len; i < dst_len; ++i)
3962 map = isl_map_fix_si(map, isl_dim_out, i, val);
3964 umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
3966 return umap;
3969 /* Group bands with the same values for tile_len and n_parallel.
3970 * The prefix schedule is then extended with a fixed coordinate that
3971 * is different for each such group.
3972 * Note that the actual values for this coordinate are not important.
3973 * The bands have already been effectively separated at a higher level
3974 * or they are independent and may be executed in parallel.
3975 * The list of band_info has been sorted before this function is called.
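 *
 * For example (hypothetical values), three sorted band_infos with
 * (tile_len, n_parallel) equal to (2, 2), (2, 2) and (3, 2) form two
 * groups; their prefix schedules would be extended with the coordinates
 * 0, 0 and 1 respectively.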
3977 static void separate_bands(struct band_info *info, int n)
3979 int i;
3980 int j = 0;
3982 for (i = 0; i < n; ++i) {
3983 int l = info[i].tile_first;
3985 if (i &&
3986 (info[i].tile_len != info[i - 1].tile_len ||
3987 info[i].n_parallel != info[i - 1].n_parallel))
3988 j++;
3990 info[i].prefix = extend_range(info[i].prefix,
3991 l, l + 1, j);
3992 info[i].tile_first = l + 1;
3996 /* Select the outermost bands in the elements of the list, align
3997 * their prefix schedules, separate bands with different values
3998 * for tile_len and/or n_parallel and then combine the resulting
3999 * prefix and suffix schedules into a single pair of prefix and
4000 * suffix schedules for the entire list.
4002 static void list_select_outer_band(struct cuda_gen *gen,
4003 __isl_take isl_band_list *list, int pos, struct band_info *list_info)
4005 isl_band *band;
4006 int i;
4007 int n = isl_band_list_n_band(list);
4008 isl_ctx *ctx = isl_band_list_get_ctx(list);
4009 struct band_info *info;
4010 int max_tile_first;
4011 isl_union_map *prefix;
4012 isl_union_map *suffix;
4014 assert(n >= 1);
4015 info = isl_calloc_array(ctx, struct band_info, n);
4016 assert(info);
4018 max_tile_first = 0;
4019 for (i = 0; i < n; ++i) {
4020 band = isl_band_list_get_band(list, i);
4021 band_select_outer_band(gen, band, pos, &info[i]);
4022 if (info[i].tile_first > max_tile_first)
4023 max_tile_first = info[i].tile_first;
4026 for (i = 0; i < n; ++i) {
4027 if (info[i].tile_first == max_tile_first)
4028 continue;
4029 info[i].prefix = extend_range(info[i].prefix,
4030 info[i].tile_first, max_tile_first, 0);
4031 info[i].tile_first = max_tile_first;
4034 qsort(info, n, sizeof(struct band_info), &cmp_band);
4036 for (i = 0; i < n - 1; ++i)
4037 if (info[i].tile_len != info[i + 1].tile_len ||
4038 info[i].n_parallel != info[i + 1].n_parallel)
4039 break;
4041 if (i < n - 1)
4042 separate_bands(info, n);
4044 prefix = info[0].prefix;
4045 suffix = info[0].suffix;
4047 for (i = 1; i < n; ++i) {
4048 prefix = isl_union_map_union(prefix, info[i].prefix);
4049 suffix = isl_union_map_union(suffix, info[i].suffix);
4052 list_info->tile_first = info[0].tile_first;
4053 list_info->tile_len = -1;
4054 list_info->prefix = prefix;
4055 list_info->suffix = suffix;
4057 isl_band_list_free(list);
4058 free(info);
4061 /* Set max_out to the maximal number of output dimensions over
4062 * all maps.
4064 static int update_max_out(__isl_take isl_map *map, void *user)
4066 int *max_out = user;
4067 int n_out = isl_map_dim(map, isl_dim_out);
4069 if (n_out > *max_out)
4070 *max_out = n_out;
4072 isl_map_free(map);
4073 return 0;
4076 struct align_range_data {
4077 int max_out;
4078 isl_union_map *res;
4081 /* Extend the dimension of the range of the given map to data->max_out and
4082 * then add the result to data->res.
4084 static int map_align_range(__isl_take isl_map *map, void *user)
4086 struct align_range_data *data = user;
4087 int i;
4088 isl_space *dim;
4089 isl_map *proj;
4090 int n_out = isl_map_dim(map, isl_dim_out);
4092 dim = isl_union_map_get_space(data->res);
4093 proj = isl_map_reverse(projection(dim, data->max_out, n_out));
4094 for (i = n_out; i < data->max_out; ++i)
4095 proj = isl_map_fix_si(proj, isl_dim_out, i, 0);
4097 map = isl_map_apply_range(map, proj);
4099 data->res = isl_union_map_add_map(data->res, map);
4101 return 0;
4104 /* Extend the ranges of the maps in the union map such that they all have
4105 * the same dimension.
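 *
 * For example (hypothetical input), the union map
 * { A[i] -> [i]; B[i, j] -> [i, j] } would be turned into
 * { A[i] -> [i, 0]; B[i, j] -> [i, j] }, padding the shorter range
 * with zeros.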
4107 static __isl_give isl_union_map *align_range(__isl_take isl_union_map *umap)
4109 struct align_range_data data;
4111 data.max_out = 0;
4112 isl_union_map_foreach_map(umap, &update_max_out, &data.max_out);
4114 data.res = isl_union_map_empty(isl_union_map_get_space(umap));
4115 isl_union_map_foreach_map(umap, &map_align_range, &data);
4117 isl_union_map_free(umap);
4118 return data.res;
4121 /* Select the outermost tilable band that (by construction)
4122 * has at least one parallel loop.
4123 * The starting position of the aligned band is stored in
4124 * gen->tile_first.
4125 * The sizes and number of parallel loops may be different in different
4126 * parts of the band forest and are therefore stored in the cuda_stmts.
4128 * Return the complete schedule, with the tilable bands aligned
4129 * at gen->tile_first and padded with zero, if needed.
4131 static __isl_give isl_union_map *select_outer_tilable_band(struct cuda_gen *gen,
4132 __isl_keep isl_schedule *schedule)
4134 isl_band_list *list;
4135 struct band_info info;
4137 gen->n_parallel = 0;
4138 gen->tile_len = -1;
4140 list = isl_schedule_get_band_forest(schedule);
4142 list_select_outer_band(gen, list, 0, &info);
4144 gen->tile_first = info.tile_first;
4145 info.suffix = align_range(info.suffix);
4147 return isl_union_map_flat_range_product(info.prefix, info.suffix);
4150 /* Set gen->untiled_len to the number of scheduling dimensions
4151 * for the schedule of the first domain.
4152 * We assume here that this number is the same for all domains.
4154 static int set_untiled_len(__isl_take isl_map *map, void *user)
4156 unsigned *untiled_len = user;
4158 *untiled_len = isl_map_dim(map, isl_dim_out);
4160 isl_map_free(map);
4161 return -1;
4164 /* Compute an appropriate schedule based on the accesses in
4165 * gen->read and gen->write.
4167 * We first compute dependences and then use those to compute
4168 * a schedule that has a parallel loop in each tilable band.
4169 * Finally, we select the outermost tilable band.
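 *
 * As a hypothetical illustration, for a statement S[i]: A[i] = f(A[i - 1])
 * with domain { S[i] : 1 <= i <= n }, the flow dependences would include
 * { S[i] -> S[i + 1] }, and the read of A[0] has no corresponding write,
 * so A[0] would end up in gen->copy_in.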
4171 static void compute_schedule(struct cuda_gen *gen,
4172 __isl_take isl_union_map *sched)
4174 isl_ctx *ctx = isl_union_map_get_ctx(sched);
4175 isl_union_set *domain;
4176 isl_union_map *empty;
4177 isl_union_map *dep_raw, *dep2, *dep3, *dep;
4178 isl_union_map *uninitialized;
4179 isl_schedule *schedule;
4181 empty = isl_union_map_empty(isl_union_map_get_space(sched));
4183 isl_union_map_compute_flow(isl_union_map_copy(gen->read),
4184 isl_union_map_copy(gen->write), empty,
4185 isl_union_map_copy(sched),
4186 &dep_raw, NULL, &uninitialized, NULL);
4187 isl_union_map_compute_flow(isl_union_map_copy(gen->write),
4188 isl_union_map_copy(gen->write),
4189 isl_union_map_copy(gen->read),
4190 isl_union_map_copy(sched),
4191 &dep2, &dep3, NULL, NULL);
4192 isl_union_map_free(sched);
4194 gen->copy_in = isl_union_map_range(uninitialized);
4196 dep = isl_union_map_union(dep2, dep3);
4197 dep = isl_union_map_union(dep, dep_raw);
4198 dep = isl_union_map_coalesce(dep);
4200 domain = extract_domain(gen);
4201 schedule = isl_union_set_compute_schedule(isl_union_set_copy(domain),
4202 isl_union_map_copy(dep), dep);
4204 sched = select_outer_tilable_band(gen, schedule);
4206 isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
4207 sched = isl_union_map_intersect_domain(sched, domain);
4208 gen->sched = sched;
4210 isl_schedule_free(schedule);
4213 static struct cuda_stmt_access **expr_extract_access(struct pet_expr *expr,
4214 struct cuda_stmt_access **next_access)
4216 struct cuda_stmt_access *access;
4217 isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
4219 access = isl_alloc_type(ctx, struct cuda_stmt_access);
4220 assert(access);
4221 access->next = NULL;
4222 access->read = expr->acc.read;
4223 access->write = expr->acc.write;
4224 access->access = isl_map_copy(expr->acc.access);
4226 *next_access = access;
4227 next_access = &(*next_access)->next;
4228 return next_access;
4231 static struct cuda_stmt_access **expr_extract_accesses(struct pet_expr *expr,
4232 struct cuda_stmt_access **next_access)
4234 int i;
4236 for (i = 0; i < expr->n_arg; ++i)
4237 next_access = expr_extract_accesses(expr->args[i],
4238 next_access);
4240 if (expr->type == pet_expr_access)
4241 next_access = expr_extract_access(expr, next_access);
4243 return next_access;
4246 static void pet_stmt_extract_accesses(struct cuda_stmt *stmt)
4248 struct cuda_stmt_access **next_access = &stmt->accesses;
4250 stmt->accesses = NULL;
4251 expr_extract_accesses(stmt->body, next_access);
4254 /* Return an array of cuda_stmt representing the statements in "scop".
4256 static struct cuda_stmt *extract_stmts(isl_ctx *ctx, struct pet_scop *scop,
4257 __isl_keep isl_set *context)
4259 int i;
4260 struct cuda_stmt *stmts;
4262 stmts = isl_calloc_array(ctx, struct cuda_stmt, scop->n_stmt);
4263 assert(stmts);
4265 for (i = 0; i < scop->n_stmt; ++i) {
4266 struct cuda_stmt *s = &stmts[i];
4268 s->domain = isl_set_copy(scop->stmts[i]->domain);
4269 s->domain = isl_set_intersect_params(s->domain,
4270 isl_set_copy(context));
4271 s->body = scop->stmts[i]->body;
4272 pet_stmt_extract_accesses(s);
4275 return stmts;
4278 /* Replace the scop in the "input" file by equivalent code
4279 * that uses the GPU. "scop" is assumed to correspond to this scop.
4281 * We first compute a schedule that respects the dependences
4282 * of the original program and select the outermost band
4283 * of tilable dimensions that has at least one parallel loop.
4284 * We then have three blocks of dimensions
4286 * H B G
4288 * The tilable band "B" is first tiled according to "tile" sizes, resulting
4289 * in
4291 * H T P G
4293 * For each iteration of the T loop and for each array, we compute
4294 * the array elements accessed by that iteration, construct a rectangular
4295 * box around it and shift it to the origin. The result is used
4296 * as shared memory for the array.
4298 * We then split off at most 2 parallel loops from the T loops and
4299 * at most 3 parallel loops from the P loops
4301 * H T1 T2 P1 P2 G
4303 * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
4304 * according to "grid"/"block" sizes.
4306 * H T1T T1P T2 P1T P1P P2 G
4308 * Finally, the T1P and P1P iterators are equated to the block and
4309 * thread dimensions respectively and so are effectively removed.
4310 * The H loops are run on the host. The T1T, T2, P1T, P2 and G loops
4311 * are run on the GPU.
4313 * Code is generated in three stages. We first generate code for the
4314 * host (the H loops), with iterators h%d. Then, for each leaf node
4315 * of the resulting AST, we generate code for the shared loops (up to
4316 * and including T2), with iterators g%d and after equating the H loops
4317 * to h%d parameters and the T1P loops to the block dimensions.
4318 * Finally, we generate code for the remaining loops in a similar fashion.
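 *
 * As a rough, hypothetical illustration: for a fully parallel
 * two-dimensional band B = (i, j) with 16x16 tiles, T consists of the
 * two tile loops and P of the two point loops; both T loops end up in
 * T1 and are wrapped over the block dimensions, both P loops end up in
 * P1 and are wrapped over the thread dimensions, and T2 and P2 are empty.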
4320 int cuda_pet(isl_ctx *ctx, struct pet_scop *scop, struct ppcg_options *options,
4321 const char *input)
4323 isl_union_map *sched;
4324 struct cuda_gen gen;
4326 if (!scop)
4327 return -1;
4329 scop = pet_scop_align_params(scop);
4331 gen.ctx = ctx;
4332 gen.context = isl_set_copy(scop->context);
4333 gen.context = add_context_from_str(gen.context, options->ctx);
4334 gen.sizes = extract_sizes_from_str(ctx, options->sizes);
4335 gen.n_stmts = scop->n_stmt;
4336 gen.stmts = extract_stmts(ctx, scop, gen.context);
4337 gen.read = pet_scop_collect_reads(scop);
4338 gen.write = pet_scop_collect_writes(scop);
4339 gen.options = options;
4340 gen.state = cloog_isl_state_malloc(gen.ctx);
4341 gen.scop = scop;
4343 cuda_open_files(&gen.cuda, input);
4345 collect_array_info(&gen);
4347 sched = pet_scop_collect_schedule(scop);
4349 compute_schedule(&gen, sched);
4351 print_host_code(&gen);
4353 cloog_state_free(gen.state);
4354 clear_cuda_gen(&gen);
4356 cuda_close_files(&gen.cuda);
4358 return 0;