cuda_pet: check for NULL scop
[ppcg.git] / cuda.c
1 /*
2 * Copyright 2010-2011 INRIA Saclay
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
7 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
8 * 91893 Orsay, France
9 */
11 #include <assert.h>
12 #include <stdlib.h>
14 #include <isl/polynomial.h>
15 #include <isl/union_set.h>
16 #include <isl/aff.h>
17 #include <isl/ilp.h>
18 #include <isl/flow.h>
19 #include <isl/band.h>
20 #include <isl/schedule.h>
21 #include <isl/options.h>
22 #include <cloog/isl/cloog.h>
24 #include "cuda.h"
25 #include "cuda_common.h"
26 #include "gpucode.h"
27 #include "schedule.h"
28 #include "ppcg_options.h"
30 /* The fields stride, shift and shift_map only contain valid information
31 * if shift != NULL.
32 * If so, they express that the current index is such that if you add shift,
33 * then the result is always a multiple of stride.
34 * shift_map contains the mapping
36 * i -> (i + shift)/stride
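/* As an illustrative example (not taken from the code above): if the
 * accessed indices are of the form i = 4 * alpha + 1, then stride = 4
 * and shift = 3, since i + 3 is always a multiple of 4, and shift_map
 * encodes i -> (i + 3)/4.
 */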
38 struct cuda_array_bound {
39 isl_int size;
40 isl_aff *lb;
42 isl_int stride;
43 isl_qpolynomial *shift;
44 isl_basic_map *shift_map;
47 struct cuda_array_info;
49 /* A group of array references in a kernel that should be handled together.
50 * If private_bound is not NULL, then it is mapped to registers.
51 * Otherwise, if shared_bound is not NULL, it is mapped to shared memory.
52 * Otherwise, it is accessed from global memory.
54 struct cuda_array_ref_group {
55 /* The references in this group access this array. */
56 struct cuda_array_info *array;
57 /* Position of this group in the list of reference groups of array. */
58 int nr;
60 /* The following fields are used during the construction of the groups.
61 * access is the combined access relation relative to the shared
62 * memory tiling.
63 * write is set if any access in the group is a write.
65 isl_map *access;
66 int write;
68 /* For each index, size and offset of piece in shared memory. */
69 struct cuda_array_bound *shared_bound;
71 /* For each index, size and offset of piece in private memory. */
72 struct cuda_array_bound *private_bound;
74 /* References in this group; point to elements of a linked list. */
75 int n_ref;
76 struct cuda_stmt_access **refs;
79 struct cuda_array_info {
80 isl_dim *dim;
81 /* Element type. */
82 char *type;
83 /* Name of the array. */
84 char *name;
85 /* Number of indices. */
86 unsigned n_index;
87 /* For each index, a bound on the array in that direction. */
88 isl_pw_aff **bound;
89 /* For each index, bound[i] specialized to the current kernel. */
90 isl_pw_aff **local_bound;
92 /* All references to this array; point to elements of a linked list. */
93 int n_ref;
94 struct cuda_stmt_access **refs;
96 /* The reference groups associated to this array. */
97 int n_group;
98 struct cuda_array_ref_group **groups;
100 /* Last shared memory tile dimension that affects tile of this array. */
101 int last_shared;
102 /* Dimension at which copying to/from shared memory is printed.
103 * if >= 0, then the value is >= last_shared
104 * if -1, then the copying is done at the leaf level.
106 int print_shared_level;
109 /* Print the name of the local copy of a given group of array references.
111 static void print_array_name(FILE *out, struct cuda_array_ref_group *group)
113 int global = 0;
115 if (group->private_bound)
116 fprintf(out, "private_");
117 else if (group->shared_bound)
118 fprintf(out, "shared_");
119 else
120 global = 1;
121 fprintf(out, "%s", group->array->name);
122 if (!global && group->array->n_group > 1)
123 fprintf(out, "_%d", group->nr);
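/* For example (purely illustrative): a group of references to an array "A"
 * is printed as "private_A" when mapped to registers, as "shared_A" when
 * mapped to shared memory, and simply as "A" when accessed from global
 * memory; if "A" has more than one reference group, the group number is
 * appended to the non-global names, e.g. "shared_A_1".
 */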
126 /* Collect all references to the given array and store pointers to them
127 * in array->refs.
129 static void collect_references(struct cuda_gen *gen,
130 struct cuda_array_info *array)
132 int i;
133 int n;
135 n = 0;
136 for (i = 0; i < gen->n_stmts; ++i) {
137 struct cuda_stmt *stmt = &gen->stmts[i];
138 struct cuda_stmt_access *access;
140 for (access = stmt->accesses; access; access = access->next) {
141 const char *name;
142 name = isl_map_get_tuple_name(access->access,
143 isl_dim_out);
144 if (name && !strcmp(array->name, name))
145 n++;
149 array->n_ref = n;
150 array->refs = isl_alloc_array(gen->ctx, struct cuda_stmt_access *, n);
151 assert(array->refs);
153 n = 0;
154 for (i = 0; i < gen->n_stmts; ++i) {
155 struct cuda_stmt *stmt = &gen->stmts[i];
156 struct cuda_stmt_access *access;
158 for (access = stmt->accesses; access; access = access->next) {
159 const char *name;
160 name = isl_map_get_tuple_name(access->access,
161 isl_dim_out);
162 if (!name || strcmp(array->name, name))
163 continue;
165 array->refs[n++] = access;
170 static struct cuda_array_bound *create_bound_list(isl_ctx *ctx, int n_index)
172 int i;
173 struct cuda_array_bound *bound;
175 bound = isl_alloc_array(ctx, struct cuda_array_bound, n_index);
176 assert(bound);
178 for (i = 0; i < n_index; ++i) {
179 isl_int_init(bound[i].size);
180 bound[i].lb = NULL;
181 isl_int_init(bound[i].stride);
182 bound[i].shift = NULL;
183 bound[i].shift_map = NULL;
186 return bound;
189 static void free_bound_list(struct cuda_array_bound *bound, int n_index)
191 int j;
193 if (!bound)
194 return;
196 for (j = 0; j < n_index; ++j) {
197 isl_int_clear(bound[j].size);
198 isl_int_clear(bound[j].stride);
199 isl_aff_free(bound[j].lb);
200 isl_qpolynomial_free(bound[j].shift);
201 isl_basic_map_free(bound[j].shift_map);
203 free(bound);
206 static struct pet_array *find_array(struct pet_scop *scop,
207 __isl_keep isl_set *accessed)
209 int i;
210 isl_id *id;
212 id = isl_set_get_tuple_id(accessed);
214 for (i = 0; i < scop->n_array; ++i) {
215 isl_id *id_i;
217 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
218 isl_id_free(id_i);
219 if (id == id_i)
220 break;
222 isl_id_free(id);
224 return i < scop->n_array ? scop->arrays[i] : NULL;
227 /* Compute bounds on the host arrays based on the accessed elements
228 * and collect all references to the array.
230 static int extract_array_info(__isl_take isl_set *array, void *user)
232 int i;
233 struct cuda_gen *gen = (struct cuda_gen *)user;
234 const char *name;
235 int n_index;
236 isl_pw_aff **bounds;
237 isl_pw_aff **local_bounds;
238 struct pet_array *pa;
240 n_index = isl_set_dim(array, isl_dim_set);
241 name = isl_set_get_tuple_name(array);
242 bounds = isl_alloc_array(isl_set_get_ctx(array),
243 isl_pw_aff *, n_index);
244 assert(bounds);
245 local_bounds = isl_calloc_array(isl_set_get_ctx(array),
246 isl_pw_aff *, n_index);
247 assert(local_bounds);
248 gen->array[gen->n_array].dim = isl_set_get_dim(array);
249 gen->array[gen->n_array].name = strdup(name);
250 gen->array[gen->n_array].n_index = n_index;
251 gen->array[gen->n_array].bound = bounds;
252 gen->array[gen->n_array].local_bound = local_bounds;
254 pa = find_array(gen->scop, array);
255 assert(pa);
257 gen->array[gen->n_array].type = strdup(pa->element_type);
259 for (i = 0; i < n_index; ++i) {
260 isl_set *dom;
261 isl_local_space *ls;
262 isl_aff *one;
263 isl_pw_aff *bound;
264 isl_set *size = i == 0 ? array : pa->extent;
266 bound = isl_set_dim_max(isl_set_copy(size), i);
267 assert(bound);
268 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
269 ls = isl_local_space_from_dim(isl_set_get_dim(dom));
270 one = isl_aff_zero(ls);
271 one = isl_aff_add_constant_si(one, 1);
272 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
273 bound = isl_pw_aff_gist(bound, isl_set_copy(gen->context));
275 bounds[i] = bound;
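/* Note that the stored bound is the maximal index value plus one, i.e.,
 * the extent of the array in this dimension. For instance (illustrative),
 * if the accessed elements cover indices 0 .. n - 1, then isl_set_dim_max
 * computes n - 1 and bounds[i] becomes n.
 */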
278 collect_references(gen, &gen->array[gen->n_array]);
280 gen->n_array++;
282 isl_set_free(array);
283 return 0;
286 void collect_array_info(struct cuda_gen *gen)
288 isl_union_set *arrays;
290 arrays = isl_union_map_range(isl_union_map_copy(gen->read));
291 arrays = isl_union_set_union(arrays,
292 isl_union_map_range(isl_union_map_copy(gen->write)));
293 arrays = isl_union_set_coalesce(arrays);
295 gen->n_array = isl_union_set_n_set(arrays);
296 gen->array = isl_alloc_array(gen->ctx,
297 struct cuda_array_info, gen->n_array);
298 assert(gen->array);
299 gen->n_array = 0;
300 isl_union_set_foreach_set(arrays, &extract_array_info, gen);
301 isl_union_set_free(arrays);
304 static void free_array_info(struct cuda_gen *gen)
306 int i, j;
308 for (i = 0; i < gen->n_array; ++i) {
309 int n_index = gen->array[i].n_index;
310 free(gen->array[i].type);
311 free(gen->array[i].name);
312 for (j = 0; j < n_index; ++j) {
313 isl_pw_aff_free(gen->array[i].bound[j]);
314 isl_pw_aff_free(gen->array[i].local_bound[j]);
316 isl_dim_free(gen->array[i].dim);
317 free(gen->array[i].bound);
318 free(gen->array[i].local_bound);
319 free(gen->array[i].refs);
321 free(gen->array);
324 static void declare_device_arrays(struct cuda_gen *gen)
326 int i;
328 for (i = 0; i < gen->n_array; ++i)
329 fprintf(gen->cuda.host_c, "%s *dev_%s;\n",
330 gen->array[i].type, gen->array[i].name);
333 static void print_array_size(struct cuda_gen *gen, FILE *out,
334 struct cuda_array_info *array)
336 int i;
337 isl_printer *prn;
339 prn = isl_printer_to_file(gen->ctx, out);
340 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
341 for (i = 0; i < array->n_index; ++i) {
342 prn = isl_printer_print_str(prn, "(");
343 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
344 prn = isl_printer_print_str(prn, ") * ");
346 prn = isl_printer_print_str(prn, "sizeof(");
347 prn = isl_printer_print_str(prn, array->type);
348 prn = isl_printer_print_str(prn, ")");
349 isl_printer_free(prn);
352 static void allocate_device_arrays(struct cuda_gen *gen)
354 int i;
356 for (i = 0; i < gen->n_array; ++i) {
357 fprintf(gen->cuda.host_c, "cudaMalloc(&dev_%s, ",
358 gen->array[i].name);
359 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
360 fprintf(gen->cuda.host_c, ");\n");
364 static void free_device_arrays(struct cuda_gen *gen)
366 int i;
368 for (i = 0; i < gen->n_array; ++i)
369 fprintf(gen->cuda.host_c, "cudaFree(dev_%s);\n",
370 gen->array[i].name);
373 static void copy_arrays_to_device(struct cuda_gen *gen)
375 int i;
377 for (i = 0; i < gen->n_array; ++i) {
378 isl_dim *dim;
379 isl_set *read_i;
380 int empty;
382 dim = isl_dim_copy(gen->array[i].dim);
383 read_i = isl_union_set_extract_set(gen->copy_in, dim);
384 empty = isl_set_fast_is_empty(read_i);
385 isl_set_free(read_i);
386 if (empty)
387 continue;
389 fprintf(gen->cuda.host_c, "cudaMemcpy(dev_%s, %s, ",
390 gen->array[i].name, gen->array[i].name);
391 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
392 fprintf(gen->cuda.host_c, ", cudaMemcpyHostToDevice);\n");
396 static void copy_arrays_from_device(struct cuda_gen *gen)
398 int i;
399 isl_union_set *write;
400 write = isl_union_map_range(isl_union_map_copy(gen->write));
402 for (i = 0; i < gen->n_array; ++i) {
403 isl_dim *dim;
404 isl_set *write_i;
405 int empty;
407 dim = isl_dim_copy(gen->array[i].dim);
408 write_i = isl_union_set_extract_set(write, dim);
409 empty = isl_set_fast_is_empty(write_i);
410 isl_set_free(write_i);
411 if (empty)
412 continue;
414 fprintf(gen->cuda.host_c, "cudaMemcpy(%s, dev_%s, ",
415 gen->array[i].name, gen->array[i].name);
416 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
417 fprintf(gen->cuda.host_c, ", cudaMemcpyDeviceToHost);\n");
420 isl_union_set_free(write);
423 static void read_sizes_from_file(struct cuda_gen *gen, const char *filename,
424 int *sizes, int len)
426 int i;
427 FILE *file;
429 file = fopen(filename, "r");
430 if (!file)
431 return;
433 for (i = 0; i < len; ++i)
434 if (fscanf(file, "%d", &sizes[i]) < 1)
435 break;
437 fclose(file);
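/* For example (illustrative), a "tile.sizes" file containing the line
 *
 *	32 32 16
 *
 * overrides the first three tile sizes; any sizes that are not listed
 * keep the default values filled in by the caller.
 */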
440 static void reverse_list(int *list, int len)
442 int i;
443 int t;
445 for (i = 0; 2 * i < len; ++i) {
446 t = list[i];
447 list[i] = list[len - 1 - i];
448 list[len - 1 - i] = t;
452 /* Read user-specified sizes from "tile.sizes", "block.sizes" and "grid.sizes"
453 * after filling in some potentially useful defaults.
455 static void read_sizes(struct cuda_gen *gen)
457 int n;
459 gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
460 assert(gen->tile_size);
461 for (n = 0; n < gen->tile_len; ++n)
462 gen->tile_size[n] = gen->options->tile_size;
463 read_sizes_from_file(gen, "tile.sizes", gen->tile_size, gen->tile_len);
465 n = gen->n_parallel;
466 gen->n_block = (n <= 3) ? n : 3;
467 switch (gen->n_block) {
468 case 1:
469 gen->block_dim[0] = 512;
470 break;
471 case 2:
472 gen->block_dim[0] = 32;
473 gen->block_dim[1] = 16;
474 break;
475 default:
476 gen->block_dim[0] = 32;
477 gen->block_dim[1] = 4;
478 gen->block_dim[2] = 4;
479 break;
481 read_sizes_from_file(gen, "block.sizes", gen->block_dim, gen->n_block);
482 reverse_list(gen->block_dim, gen->n_block);
484 gen->n_grid = (n <= 2) ? n : 2;
485 switch (gen->n_grid) {
486 case 1:
487 gen->grid_dim[0] = 65536;
488 break;
489 default:
490 gen->grid_dim[0] = 256;
491 gen->grid_dim[1] = 256;
492 break;
494 read_sizes_from_file(gen, "grid.sizes", gen->grid_dim, gen->n_grid);
495 reverse_list(gen->grid_dim, gen->n_grid);
498 static void free_stmts(struct cuda_stmt *stmts, int n)
500 int i;
502 for (i = 0; i < n; ++i) {
503 struct cuda_stmt_access *access, *next;
505 for (access = stmts[i].accesses; access; access = next) {
506 next = access->next;
507 isl_map_free(access->access);
508 free(access);
511 isl_set_free(stmts[i].domain);
513 free(stmts);
516 void clear_cuda_gen(struct cuda_gen *gen)
518 free_stmts(gen->stmts, gen->n_stmts);
519 free_array_info(gen);
520 isl_set_free(gen->context);
521 isl_union_set_free(gen->copy_in);
522 isl_union_map_free(gen->sched);
523 isl_union_map_free(gen->read);
524 isl_union_map_free(gen->write);
527 static void print_reverse_list(FILE *out, int len, int *list)
529 int i;
531 for (i = 0; i < len; ++i) {
532 if (i)
533 fprintf(out, ", ");
534 fprintf(out, "%d", list[len - 1 - i]);
538 static void print_kernel_launch(struct cuda_gen *gen,
539 __isl_keep isl_union_set *arrays)
541 int i;
542 int first = 1;
543 unsigned nparam;
544 isl_dim *dim;
546 print_indent(gen->code.dst, gen->code.indent);
547 fprintf(gen->code.dst, "kernel%d <<<k%d_dimGrid, k%d_dimBlock>>> (",
548 gen->kernel_id, gen->kernel_id, gen->kernel_id);
549 fprintf(gen->cuda.kernel_c, "__global__ void kernel%d(",
550 gen->kernel_id);
551 fprintf(gen->cuda.kernel_h, "__global__ void kernel%d(",
552 gen->kernel_id);
554 for (i = 0; i < gen->n_array; ++i) {
555 isl_dim *dim;
556 isl_set *arr;
557 int empty;
559 dim = isl_dim_copy(gen->array[i].dim);
560 arr = isl_union_set_extract_set(arrays, dim);
561 empty = isl_set_fast_is_empty(arr);
562 isl_set_free(arr);
563 if (empty)
564 continue;
566 if (!first) {
567 fprintf(gen->code.dst, ", ");
568 fprintf(gen->cuda.kernel_c, ", ");
569 fprintf(gen->cuda.kernel_h, ", ");
572 fprintf(gen->code.dst, "dev_%s", gen->array[i].name);
573 fprintf(gen->cuda.kernel_c, "%s *%s",
574 gen->array[i].type, gen->array[i].name);
575 fprintf(gen->cuda.kernel_h, "%s *%s",
576 gen->array[i].type, gen->array[i].name);
578 first = 0;
581 dim = isl_union_set_get_dim(arrays);
582 nparam = isl_dim_size(dim, isl_dim_param);
583 for (i = 0; i < nparam; ++i) {
584 const char *name = isl_dim_get_name(dim, isl_dim_param, i);
585 if (!first) {
586 fprintf(gen->code.dst, ", ");
587 fprintf(gen->cuda.kernel_c, ", ");
588 fprintf(gen->cuda.kernel_h, ", ");
590 fprintf(gen->code.dst, "%s", name);
591 fprintf(gen->cuda.kernel_c, "int %s", name);
592 fprintf(gen->cuda.kernel_h, "int %s", name);
593 first = 0;
595 isl_dim_free(dim);
597 for (i = 0; i < gen->tile_first; ++i) {
598 if (!first) {
599 fprintf(gen->code.dst, ", ");
600 fprintf(gen->cuda.kernel_c, ", ");
601 fprintf(gen->cuda.kernel_h, ", ");
603 fprintf(gen->code.dst, "h%d", i);
604 fprintf(gen->cuda.kernel_c, "int h%d", i);
605 fprintf(gen->cuda.kernel_h, "int h%d", i);
606 first = 0;
609 fprintf(gen->code.dst, ");\n");
610 fprintf(gen->cuda.kernel_c, ")\n");
611 fprintf(gen->cuda.kernel_h, ");\n");
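/* As an illustrative sketch (assuming two float arrays A and B, a single
 * parameter N and one outer host loop iterator), the code above would print
 *
 *	kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_A, dev_B, N, h0);
 *
 * on the host side and
 *
 *	__global__ void kernel0(float *A, float *B, int N, int h0)
 *
 * as the corresponding kernel declaration.
 */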
614 /* Construct a map from a domain of dimensionality "len"
615 * to a domain of dimensionality "len" + "tile_len" that tiles
616 * the "tile_len" coordinates starting at "first".
617 * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
618 * "dim" prescribes the parameters.
620 static __isl_give isl_map *tile(__isl_take isl_dim *dim, int len,
621 int first, int tile_len, int *tile_size)
623 int i;
624 isl_int v;
625 isl_basic_map *bmap;
626 isl_constraint *c;
628 isl_int_init(v);
630 dim = isl_dim_add(dim, isl_dim_in, len);
631 dim = isl_dim_add(dim, isl_dim_out, len + tile_len);
632 bmap = isl_basic_map_universe(isl_dim_copy(dim));
634 for (i = 0; i < len - tile_len; ++i) {
635 int j = i < first ? i : i + tile_len;
636 int k = i < first ? i : i + 2 * tile_len;
638 c = isl_equality_alloc(isl_dim_copy(dim));
639 isl_int_set_si(v, -1);
640 isl_constraint_set_coefficient(c, isl_dim_in, j, v);
641 isl_int_set_si(v, 1);
642 isl_constraint_set_coefficient(c, isl_dim_out, k, v);
643 bmap = isl_basic_map_add_constraint(bmap, c);
646 for (i = 0; i < tile_len; ++i) {
647 c = isl_equality_alloc(isl_dim_copy(dim));
648 isl_int_set_si(v, -1);
649 isl_constraint_set_coefficient(c, isl_dim_in, first + i, v);
650 isl_int_set_si(v, tile_size[i]);
651 isl_constraint_set_coefficient(c, isl_dim_out, first + i, v);
652 isl_int_set_si(v, 1);
653 isl_constraint_set_coefficient(c, isl_dim_out,
654 first + i + tile_len, v);
655 bmap = isl_basic_map_add_constraint(bmap, c);
657 c = isl_inequality_alloc(isl_dim_copy(dim));
658 isl_int_set_si(v, 1);
659 isl_constraint_set_coefficient(c, isl_dim_out,
660 first + i + tile_len, v);
661 bmap = isl_basic_map_add_constraint(bmap, c);
663 c = isl_inequality_alloc(isl_dim_copy(dim));
664 isl_int_set_si(v, -1);
665 isl_constraint_set_coefficient(c, isl_dim_out,
666 first + i + tile_len, v);
667 isl_int_set_si(v, tile_size[i] - 1);
668 isl_constraint_set_constant(c, v);
669 bmap = isl_basic_map_add_constraint(bmap, c);
672 isl_dim_free(dim);
673 isl_int_clear(v);
675 return isl_map_from_basic_map(bmap);
678 /* Construct a map from a domain of dimensionality "len"
679 * to a domain of dimensionality "len" + "wrap_len" that "wraps"
680 * the "wrap_len" coordinates starting at "first" according to "wrap_size".
681 * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
682 * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
683 * that are projected out at the end.
684 * "dim" prescribes the parameters.
686 static __isl_give isl_map *wrap(__isl_take isl_dim *dim, int len,
687 int first, int wrap_len, int *wrap_size)
689 int i;
690 isl_basic_map *bmap;
691 isl_constraint *c;
693 dim = isl_dim_add(dim, isl_dim_in, len);
694 dim = isl_dim_add(dim, isl_dim_out, len + 2 * wrap_len);
695 bmap = isl_basic_map_universe(isl_dim_copy(dim));
697 for (i = 0; i < len; ++i) {
698 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
700 c = isl_equality_alloc(isl_dim_copy(dim));
701 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
702 isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
703 bmap = isl_basic_map_add_constraint(bmap, c);
706 for (i = 0; i < wrap_len; ++i) {
707 c = isl_equality_alloc(isl_dim_copy(dim));
708 isl_constraint_set_coefficient_si(c, isl_dim_out,
709 first + i, -1);
710 isl_constraint_set_coefficient_si(c, isl_dim_out,
711 first + wrap_len + i, 1);
712 isl_constraint_set_coefficient_si(c, isl_dim_out,
713 first + 2 * wrap_len + i, wrap_size[i]);
714 bmap = isl_basic_map_add_constraint(bmap, c);
716 c = isl_inequality_alloc(isl_dim_copy(dim));
717 isl_constraint_set_coefficient_si(c, isl_dim_out,
718 first + wrap_len + i, 1);
719 bmap = isl_basic_map_add_constraint(bmap, c);
721 c = isl_inequality_alloc(isl_dim_copy(dim));
722 isl_constraint_set_coefficient_si(c, isl_dim_out,
723 first + wrap_len + i, -1);
724 isl_constraint_set_constant_si(c, wrap_size[i] - 1);
725 bmap = isl_basic_map_add_constraint(bmap, c);
728 isl_dim_free(dim);
730 bmap = isl_basic_map_project_out(bmap, isl_dim_out,
731 first + 2 * wrap_len, wrap_len);
733 return isl_map_from_basic_map(bmap);
736 /* Add "n" parameters named prefix%d.
738 static __isl_give isl_set *add_params( __isl_take isl_set *set,
739 int n, const char *prefix)
741 int i;
742 unsigned nparam;
743 char name[20];
745 nparam = isl_set_dim(set, isl_dim_param);
746 set = isl_set_add_dims(set, isl_dim_param, n);
748 for (i = 0; i < n; ++i) {
749 snprintf(name, sizeof(name), "%s%d", prefix, i);
750 set = isl_set_set_dim_name(set, isl_dim_param,
751 nparam + i, name);
754 return set;
757 /* Equate the "n" dimensions of "set" starting at "first" to
758 * freshly created parameters named prefix%d.
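/* For instance (illustrative), parametrize(set, 0, 2, "h") adds parameters
 * h0 and h1 and intersects "set" with the constraints that its first two
 * set dimensions are equal to h0 and h1, respectively.
 */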
760 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
761 int first, int n, const char *prefix)
763 int i;
764 unsigned nparam;
765 isl_int v;
766 isl_dim *dim;
767 isl_basic_set *bset;
768 isl_constraint *c;
770 nparam = isl_set_dim(set, isl_dim_param);
772 set = add_params(set, n, prefix);
774 dim = isl_set_get_dim(set);
775 bset = isl_basic_set_universe(isl_dim_copy(dim));
777 isl_int_init(v);
779 for (i = 0; i < n; ++i) {
780 c = isl_equality_alloc(isl_dim_copy(dim));
781 isl_int_set_si(v, -1);
782 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
783 isl_int_set_si(v, 1);
784 isl_constraint_set_coefficient(c, isl_dim_set, first + i, v);
785 bset = isl_basic_set_add_constraint(bset, c);
788 isl_int_clear(v);
789 isl_dim_free(dim);
791 return isl_set_intersect(set, isl_set_from_basic_set(bset));
794 static __isl_give isl_set *parametrization(__isl_take isl_dim *dim,
795 int len, int first, int n, const char *prefix)
797 isl_set *set;
799 dim = isl_dim_add(dim, isl_dim_set, len);
800 set = isl_set_universe(dim);
802 return parametrize(set, first, n, prefix);
805 /* Tile the B loops over the tile sizes and then tile/wrap
806 * the T1 loops over the blocks.
808 static __isl_give isl_union_map *tile_schedule(struct cuda_gen *gen,
809 __isl_take isl_union_map *sched)
811 isl_dim *dim;
812 isl_map *tiling, *block_tiling;
814 dim = isl_union_map_get_dim(sched);
815 tiling = tile(isl_dim_copy(dim), gen->untiled_len,
816 gen->tile_first, gen->tile_len, gen->tile_size);
818 if (gen->options->wrap)
819 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
820 gen->tile_first, gen->n_grid, gen->grid_dim);
821 else
822 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
823 gen->tile_first, gen->n_grid, gen->grid_dim);
825 gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
827 tiling = isl_map_apply_range(tiling, block_tiling);
829 sched = isl_union_map_apply_range(sched,
830 isl_union_map_from_map(tiling));
832 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
834 return sched;
837 static __isl_give isl_union_map *parametrize_tiled_schedule(
838 struct cuda_gen *gen, __isl_take isl_union_map *sched)
840 isl_dim *dim;
841 isl_set *par;
843 dim = isl_union_map_get_dim(sched);
844 par = parametrization(dim, gen->tiled_len, 0, gen->tile_first, "h");
845 sched = isl_union_map_intersect_range(sched,
846 isl_union_set_from_set(par));
848 dim = isl_union_map_get_dim(sched);
849 par = parametrization(dim, gen->tiled_len,
850 gen->tile_first + gen->n_grid, gen->n_grid, "b");
851 sched = isl_union_map_intersect_range(sched,
852 isl_union_set_from_set(par));
854 return sched;
857 /* Tile/wrap the P1 loops over the threads.
859 static __isl_give isl_union_map *thread_tile_schedule(struct cuda_gen *gen,
860 __isl_take isl_union_map *sched)
862 isl_dim *dim;
863 isl_map *tiling;
864 isl_set *par;
866 dim = isl_union_map_get_dim(sched);
868 if (gen->options->wrap)
869 tiling = wrap(isl_dim_copy(dim), gen->tiled_len,
870 gen->shared_len, gen->n_block, gen->block_dim);
871 else
872 tiling = tile(isl_dim_copy(dim), gen->tiled_len,
873 gen->shared_len, gen->n_block, gen->block_dim);
874 gen->thread_tiled_len = gen->tiled_len + gen->n_block;
876 sched = isl_union_map_apply_range(sched,
877 isl_union_map_from_map(tiling));
879 par = parametrization(dim, gen->thread_tiled_len,
880 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
881 gen->n_block, "t");
882 sched = isl_union_map_intersect_range(sched,
883 isl_union_set_from_set(par));
885 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
887 return sched;
890 /* If the user asked for it, scale the shared memory tile loops
891 * (T1P and T2) of "sched" by gen->tile_size[i].
892 * If we are not performing "wrapping", then additionally scale the T1P
893 * loops by gen->grid_dim[i].
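/* The scaling itself is expressed as a map that multiplies each selected
 * coordinate by a per-dimension factor f, i.e. (illustratively)
 *
 *	[..., s_i, ...] -> [..., f * s_i, ...]
 *
 * with f = 1 for all other coordinates.
 */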
895 static __isl_give isl_union_map *scale_tile_loops(struct cuda_gen *gen,
896 __isl_take isl_union_map *sched)
898 int i;
899 isl_dim *dim;
900 isl_basic_map *scale;
901 isl_constraint *c;
903 if (!gen->options->scale_tile_loops)
904 return sched;
906 dim = isl_union_map_get_dim(sched);
907 dim = isl_dim_add(dim, isl_dim_in, gen->tiled_len);
908 dim = isl_dim_add(dim, isl_dim_out, gen->tiled_len);
909 scale = isl_basic_map_universe(isl_dim_copy(dim));
911 for (i = 0; i < gen->tiled_len; ++i) {
912 int f = 1;
914 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
915 f = gen->tile_size[i - gen->tile_first];
916 if (!gen->options->wrap)
917 f *= gen->grid_dim[i - gen->tile_first];
918 } else if (i >= gen->tile_first + gen->n_grid &&
919 i < gen->tile_first + gen->n_grid + gen->tile_len) {
920 f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
923 c = isl_equality_alloc(isl_dim_copy(dim));
924 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
925 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
926 scale = isl_basic_map_add_constraint(scale, c);
929 isl_dim_free(dim);
931 sched = isl_union_map_apply_range(sched,
932 isl_union_map_from_map(isl_map_from_basic_map(scale)));
934 return sched;
937 /* If we are not performing "wrapping" and if the user asked for it,
938 * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
940 static __isl_give isl_union_map *scale_thread_tile_loops(struct cuda_gen *gen,
941 __isl_take isl_union_map *sched)
943 int i;
944 isl_dim *dim;
945 isl_basic_map *scale;
946 isl_constraint *c;
948 if (gen->options->wrap)
949 return sched;
950 if (!gen->options->scale_tile_loops)
951 return sched;
953 dim = isl_union_map_get_dim(sched);
954 dim = isl_dim_add(dim, isl_dim_in, gen->thread_tiled_len);
955 dim = isl_dim_add(dim, isl_dim_out, gen->thread_tiled_len);
956 scale = isl_basic_map_universe(isl_dim_copy(dim));
958 for (i = 0; i < gen->thread_tiled_len; ++i) {
959 int f = 1;
961 if (i >= gen->shared_len &&
962 i < gen->shared_len + gen->n_block)
963 f = gen->block_dim[i - gen->shared_len];
965 c = isl_equality_alloc(isl_dim_copy(dim));
966 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
967 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
968 scale = isl_basic_map_add_constraint(scale, c);
971 isl_dim_free(dim);
973 sched = isl_union_map_apply_range(sched,
974 isl_union_map_from_map(isl_map_from_basic_map(scale)));
976 return sched;
979 /* If we are not performing "wrapping" and if the user asked for it,
980 * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
982 static __isl_give isl_union_map *scale_access_tile_loops(struct cuda_gen *gen,
983 __isl_take isl_union_map *sched, int len, int first, int n_tile)
985 int i;
986 isl_dim *dim;
987 isl_basic_map *scale;
988 isl_constraint *c;
990 if (gen->options->wrap)
991 return sched;
992 if (!gen->options->scale_tile_loops)
993 return sched;
995 dim = isl_union_map_get_dim(sched);
996 dim = isl_dim_add(dim, isl_dim_in, len);
997 dim = isl_dim_add(dim, isl_dim_out, len);
998 scale = isl_basic_map_universe(isl_dim_copy(dim));
1000 for (i = 0; i < len; ++i) {
1001 int f = 1;
1003 if (i >= first && i < first + n_tile)
1004 f = gen->block_dim[i - first];
1006 c = isl_equality_alloc(isl_dim_copy(dim));
1007 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1008 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1009 scale = isl_basic_map_add_constraint(scale, c);
1012 isl_dim_free(dim);
1014 sched = isl_union_map_apply_range(sched,
1015 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1017 return sched;
1020 /* If print_user_stmt is set, we want to print the statements ourselves,
1021 * instead of relying on the C preprocessor. If so, we need to use
1022 * the save_domains option so that the domains will be saved on the statement
1023 * nodes.
1025 static void print_cloog_shared_body(struct cuda_gen *gen,
1026 __isl_keep isl_set *context, __isl_keep isl_union_map *sched, int len,
1027 void (*print_user_stmt)(struct gpucode_info *info,
1028 struct clast_user_stmt *s),
1029 int first_unroll)
1031 int i;
1032 CloogOptions *options;
1033 CloogDomain *cloog_context;
1034 CloogUnionDomain *ud;
1035 CloogInput *input;
1036 struct clast_stmt *stmt;
1037 char name[20];
1039 sched = isl_union_map_copy(sched);
1040 sched = isl_union_map_align_params(sched, isl_set_get_dim(context));
1042 options = cloog_options_malloc(gen->state);
1043 options->language = LANGUAGE_C;
1044 options->strides = 1;
1045 options->sh = 1;
1046 options->f = len;
1047 options->l = -1;
1048 options->override = 1;
1049 options->save_domains = 1;
1050 options->noscalars = 1;
1051 options->first_unroll = first_unroll;
1053 ud = cloog_union_domain_from_isl_union_map(sched);
1054 for (i = 0; i < len; ++i) {
1055 snprintf(name, sizeof(name), "c%d", i);
1056 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
1058 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
1059 input = cloog_input_alloc(cloog_context, ud);
1061 stmt = cloog_clast_create_from_input(input, options);
1063 gen->stmt_code.indent = gen->kernel_code.indent;
1064 gen->stmt_code.dst = gen->cuda.kernel_c;
1065 gen->stmt_code.print_user_stmt = print_user_stmt;
1066 gen->stmt_code.print_user_stmt_list = NULL;
1067 gen->stmt_code.print_for_head = NULL;
1068 gen->stmt_code.print_for_foot = NULL;
1069 gen->stmt_code.user = gen;
1070 gpu_print_host_stmt(&gen->stmt_code, stmt);
1072 cloog_clast_free(stmt);
1073 cloog_options_free(options);
1076 /* Add "len" parameters p[i] called prefix%d,
1077 * with bounds 0 <= p[i] < size[i].
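/* For example (illustrative), add_bounded_parameters(set, 2, size, "t")
 * with size = {32, 16} adds parameters t0 and t1 together with the
 * constraints 0 <= t0 <= 31 and 0 <= t1 <= 15.
 */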
1079 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1080 int len, int *size, const char *prefix)
1082 int i;
1083 unsigned nparam;
1084 isl_int v;
1085 isl_dim *dim;
1086 isl_basic_set *bset;
1087 isl_constraint *c;
1088 char name[20];
1090 nparam = isl_set_dim(set, isl_dim_param);
1091 set = isl_set_add_dims(set, isl_dim_param, len);
1093 for (i = 0; i < len; ++i) {
1094 snprintf(name, sizeof(name), "%s%d", prefix, i);
1095 set = isl_set_set_dim_name(set, isl_dim_param,
1096 nparam + i, name);
1099 dim = isl_set_get_dim(set);
1100 bset = isl_basic_set_universe(isl_dim_copy(dim));
1102 isl_int_init(v);
1104 for (i = 0; i < len; ++i) {
1105 c = isl_inequality_alloc(isl_dim_copy(dim));
1106 isl_int_set_si(v, 1);
1107 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1108 bset = isl_basic_set_add_constraint(bset, c);
1110 c = isl_inequality_alloc(isl_dim_copy(dim));
1111 isl_int_set_si(v, -1);
1112 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1113 isl_int_set_si(v, size[i] - 1);
1114 isl_constraint_set_constant(c, v);
1115 bset = isl_basic_set_add_constraint(bset, c);
1118 isl_int_clear(v);
1119 isl_dim_free(dim);
1121 return isl_set_intersect(set, isl_set_from_basic_set(bset));
1124 static void print_shared_body(struct cuda_gen *gen,
1125 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched,
1126 int len, void (*print_user_stmt)(struct gpucode_info *info,
1127 struct clast_user_stmt *s),
1128 int first_unroll)
1130 isl_set *context;
1132 context = isl_set_copy(shared_domain);
1133 context = parametrize(context, 0, gen->shared_len, "g");
1134 context = isl_set_project_out(context, isl_dim_set, 0, gen->shared_len);
1135 context = add_bounded_parameters(context,
1136 gen->n_block, gen->block_dim, "t");
1138 print_cloog_shared_body(gen, context, sched, len, print_user_stmt,
1139 first_unroll);
1141 isl_set_free(context);
1144 /* Given a tile of an array, construct a map that maps each element
1145 * of the tile to a copy of the tile shifted to the origin
1146 * (based on the lower bounds in group->private_bound or group->shared_bound).
1147 * If any of the indices is strided, then {private,shared}_bound[i].shift_map
1148 * is applied to the index first.
1149 * The domain of the resulting map is "access",
1150 * while the range space is anonymous.
1152 static __isl_give isl_map *shift_access(__isl_take isl_set *access,
1153 struct cuda_array_ref_group *group)
1155 int i;
1156 isl_dim *dim;
1157 isl_basic_set *bset;
1158 isl_basic_map *bmap;
1159 isl_aff *lb;
1160 isl_basic_set *offset;
1161 isl_basic_map *shift;
1162 isl_basic_map *pre_shift;
1163 isl_map *sched;
1164 const char *name;
1165 struct cuda_array_bound *bounds;
1166 int n_index = group->array->n_index;
1168 bounds = group->private_bound;
1169 if (!bounds)
1170 bounds = group->shared_bound;
1172 dim = isl_set_get_dim(access);
1173 dim = isl_dim_drop(dim, isl_dim_set, 0, n_index);
1174 offset = isl_basic_set_universe(dim);
1175 for (i = 0; i < n_index; ++i) {
1176 lb = isl_aff_copy(bounds[i].lb);
1177 bmap = isl_basic_map_from_aff(lb);
1178 bset = isl_basic_map_range(bmap);
1179 offset = isl_basic_set_flat_product(offset, bset);
1181 offset = isl_basic_set_neg(offset);
1183 dim = isl_dim_map_from_set(isl_set_get_dim(access));
1184 shift = isl_basic_map_identity(dim);
1185 shift = isl_basic_map_set_tuple_name(shift, isl_dim_out, NULL);
1187 bset = isl_basic_set_universe(isl_set_get_dim(access));
1188 bmap = isl_basic_map_from_domain_and_range(bset, offset);
1190 shift = isl_basic_map_sum(shift, bmap);
1192 dim = isl_set_get_dim(access);
1193 dim = isl_dim_drop(dim, isl_dim_set, 0, n_index);
1194 dim = isl_dim_map_from_set(dim);
1195 pre_shift = isl_basic_map_universe(isl_dim_copy(dim));
1196 dim = isl_dim_add(dim, isl_dim_in, 1);
1197 dim = isl_dim_add(dim, isl_dim_out, 1);
1198 for (i = 0; i < n_index; ++i) {
1199 if (!bounds[i].shift_map)
1200 bmap = isl_basic_map_identity(isl_dim_copy(dim));
1201 else
1202 bmap = isl_basic_map_copy(bounds[i].shift_map);
1203 pre_shift = isl_basic_map_flat_product(pre_shift, bmap);
1205 isl_dim_free(dim);
1206 name = isl_basic_map_get_tuple_name(shift, isl_dim_in);
1207 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_in, name);
1208 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_out, name);
1209 shift = isl_basic_map_apply_range(pre_shift, shift);
1211 sched = isl_map_from_basic_map(shift);
1212 sched = isl_map_intersect_domain(sched, access);
1214 return sched;
1217 /* Construct a schedule for iterating over all elements in the given
1218 * piece of an array. The schedule iterates over a copy of the piece
1219 * that is shifted to the origin.
1220 * We subsequently also perform the tiling/wrapping over the threads.
1222 * In particular, we tile the final iterators so that the final thread
1223 * dimension runs over the final array dimension.
1224 * However, if those final iterators have only a single iteration,
1225 * we try to tile earlier iterators instead.
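/* As an illustration (not from the code): with gen->n_block = 2 and a
 * three-dimensional access space [a, b, c] in which c takes a single fixed
 * value, "first" moves from 1 to 0, so that the threads are mapped over
 * a and b instead of over b and c.
 */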
1227 static __isl_give isl_union_map *access_schedule(struct cuda_gen *gen,
1228 __isl_take isl_set *access, struct cuda_array_ref_group *group)
1230 isl_dim *dim;
1231 isl_map *sched;
1232 isl_union_map *usched;
1233 isl_map *tiling;
1234 isl_set *par;
1235 unsigned nvar = isl_set_dim(access, isl_dim_set);
1236 int n_tile;
1237 int first;
1239 sched = shift_access(access, group);
1241 n_tile = gen->n_block;
1242 if (n_tile > nvar) {
1243 int i;
1244 sched = isl_map_insert(sched, isl_dim_out, 0, n_tile - nvar);
1245 for (i = 0; i < n_tile - nvar; ++i)
1246 sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1247 nvar = n_tile;
1250 first = nvar - n_tile;
1252 for (; first > 0; first --)
1253 if (!isl_map_plain_is_fixed(sched, isl_dim_out,
1254 first + n_tile - 1, NULL))
1255 break;
1257 dim = isl_map_get_dim(sched);
1258 dim = isl_dim_drop(dim, isl_dim_in, 0, isl_dim_size(dim, isl_dim_in));
1259 dim = isl_dim_drop(dim, isl_dim_out, 0, nvar);
1260 if (gen->options->wrap)
1261 tiling = wrap(isl_dim_copy(dim), nvar, first,
1262 n_tile, gen->block_dim);
1263 else
1264 tiling = tile(isl_dim_copy(dim), nvar, first,
1265 n_tile, gen->block_dim);
1266 sched = isl_map_apply_range(sched, tiling);
1268 par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1269 usched = isl_union_map_from_map(sched);
1270 usched = isl_union_map_intersect_range(usched,
1271 isl_union_set_from_set(par));
1273 usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1274 first, n_tile);
1276 return usched;
1279 static void print_shared_access(struct cuda_gen *gen,
1280 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
1281 const char *type, struct cuda_array_ref_group *group)
1283 const char *array_name;
1284 char *name;
1285 isl_ctx *ctx;
1286 isl_union_map *sched;
1287 unsigned nvar = isl_set_dim(access, isl_dim_set);
1288 int n_tile;
1290 ctx = isl_set_get_ctx(access);
1291 array_name = isl_set_get_tuple_name(access);
1292 name = isl_alloc_array(ctx, char,
1293 strlen(type) + sizeof("_shared_") + strlen(array_name) + 20);
1294 if (group->array->n_group > 1)
1295 sprintf(name, "%s_shared_%s_%d", type, array_name, group->nr);
1296 else
1297 sprintf(name, "%s_shared_%s", type, array_name);
1298 access = isl_set_set_tuple_name(access, name);
1299 free(name);
1301 sched = access_schedule(gen, access, group);
1303 n_tile = gen->n_block;
1304 if (n_tile > nvar)
1305 n_tile = nvar;
1307 print_shared_body(gen, shared_domain, sched, nvar + n_tile, NULL, -1);
1309 isl_union_map_free(sched);
1312 /* Return the union of all read (read = 1) and/or write (write = 1)
1313 * access relations in the group.
1315 static __isl_give isl_union_map *group_access_relation(
1316 struct cuda_array_ref_group *group, int read, int write)
1318 int i;
1319 isl_union_map *access;
1321 access = isl_union_map_empty(isl_map_get_dim(group->access));
1322 for (i = 0; i < group->n_ref; ++i) {
1323 isl_map *map_i;
1325 if (!((read && group->refs[i]->read) ||
1326 (write && group->refs[i]->write)))
1327 continue;
1328 map_i = isl_map_copy(group->refs[i]->access);
1329 access = isl_union_map_union(access,
1330 isl_union_map_from_map(map_i));
1333 return access;
1336 /* Check that none of the shared memory tiles involve any strides.
1338 static int no_strides(struct cuda_array_ref_group *group)
1340 int i;
1341 int n_index = group->array->n_index;
1343 for (i = 0; i < n_index; ++i)
1344 if (group->shared_bound[i].shift)
1345 return 0;
1347 return 1;
1350 /* Return a set containing the values of the given index i
1351 * of the elements in the array tile in global memory that corresponds
1352 * to the shared memory copy.
1353 * In particular, if a is the index, we return a set with constraints
1355 * tile_offset <= a <= tile_offset + tile_size - 1
1357 * and
1359 * 0 <= a <= array_size - 1
1362 static __isl_give isl_set *group_tile_dim(struct cuda_array_ref_group *group,
1363 int i)
1365 isl_basic_set *tile;
1366 isl_aff *aff;
1367 isl_constraint *c;
1368 isl_local_space *ls;
1369 isl_pw_aff *bound;
1370 isl_set *dom;
1371 isl_set *tile_set;
1373 aff = isl_aff_copy(group->shared_bound[i].lb);
1374 aff = isl_aff_add_dims(aff, isl_dim_set, 1);
1375 ls = isl_aff_get_local_space(aff);
1376 aff = isl_aff_neg(aff);
1377 aff = isl_aff_add_coefficient_si(aff, isl_dim_set, 0, 1);
1378 c = isl_inequality_from_aff(isl_aff_copy(aff));
1379 tile = isl_basic_set_from_constraint(c);
1381 aff = isl_aff_neg(aff);
1382 aff = isl_aff_add_constant(aff, group->shared_bound[i].size);
1383 aff = isl_aff_add_constant_si(aff, -1);
1384 c = isl_inequality_from_aff(aff);
1385 tile = isl_basic_set_add_constraint(tile, c);
1387 aff = isl_aff_zero(ls);
1388 aff = isl_aff_add_coefficient_si(aff, isl_dim_set, 0, 1);
1389 c = isl_inequality_from_aff(aff);
1390 tile = isl_basic_set_add_constraint(tile, c);
1392 bound = isl_pw_aff_copy(group->array->bound[i]);
1393 bound = isl_pw_aff_add_dims(bound, isl_dim_set, 1);
1394 ls = isl_local_space_from_dim(isl_pw_aff_get_dim(bound));
1395 aff = isl_aff_zero(ls);
1396 aff = isl_aff_add_coefficient_si(aff, isl_dim_set, 0, 1);
1397 aff = isl_aff_add_constant_si(aff, 1);
1398 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
1400 tile_set = isl_pw_aff_ge_set(bound, isl_pw_aff_alloc(dom, aff));
1401 tile_set = isl_set_align_params(tile_set, isl_basic_set_get_dim(tile));
1402 tile_set = isl_set_intersect(tile_set, isl_set_from_basic_set(tile));
1404 return tile_set;
1407 /* Return a set containing the elements in the array tile in
1408 * global memory that corresponds to the shared memory copy.
1410 static __isl_give isl_set *group_tile(struct cuda_array_ref_group *group)
1412 int i;
1413 int n_index = group->array->n_index;
1414 isl_set *tile;
1416 tile = group_tile_dim(group, 0);
1417 for (i = 1; i < n_index; ++i) {
1418 isl_set *tile_i;
1420 tile_i = group_tile_dim(group, i);
1421 tile = isl_set_flat_product(tile, tile_i);
1424 tile = isl_set_set_tuple_name(tile, group->array->name);
1426 return tile;
1429 /* Print code for reading into or writing from shared memory
1430 * the given array reference group.
1432 * sched maps the original iteration domains to the shared memory tile loops.
1434 * If we are performing a read from global memory to shared memory,
1435 * if the array involved is not a scalar and if the definition of the
1436 * shared memory tiles does not involve any strides, then we copy
1437 * the entire tile to shared memory. This may result in some extra
1438 * elements getting copied, but it should lead to simpler code
1439 * (which means that fewer registers may be needed) and less divergence.
1441 * Otherwise, we only copy the elements that will be read or have been written
1442 * in the kernel.
1444 * Note that the requirement that there be no strides can easily be lifted.
1445 * We would just need to add constraints of the form
1447 * shift + a = stride * alpha
1449 static int print_group_shared_accesses(struct cuda_gen *gen,
1450 struct cuda_array_ref_group *group, const char *type,
1451 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched)
1453 int read;
1454 isl_union_map *access;
1455 isl_union_set *uset;
1456 isl_set *access_set;
1458 if (group->private_bound)
1459 return 0;
1460 if (!group->shared_bound)
1461 return 0;
1463 read = !strcmp(type, "read");
1465 access = group_access_relation(group, read, !read);
1466 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
1467 uset = isl_union_map_range(access);
1469 if (isl_union_set_is_empty(uset)) {
1470 isl_union_set_free(uset);
1471 return 0;
1474 if (read && group->array->n_index > 0 && no_strides(group)) {
1475 isl_union_set_free(uset);
1476 access_set = group_tile(group);
1477 print_shared_access(gen, shared_domain, access_set,
1478 type, group);
1479 return 1;
1482 access_set = isl_set_from_union_set(uset);
1483 access_set = isl_set_coalesce(access_set);
1485 print_shared_access(gen, shared_domain, access_set, type, group);
1487 return 1;
1490 /* Print code for reading into or writing from shared memory at
1491 * the given level (-1 for innermost).
1493 * If we are not printing at the innermost level, then the dimensionality
1494 * of shared_domain may be smaller than gen->shared_len.
1495 * As the rest of the code assumes that the domain of access has
1496 * gen->shared_len dimensions, we therefore may need to embed this domain
1497 * in a higher dimensional space after intersection with shared_domain.
1499 static void print_shared_accesses(struct cuda_gen *gen,
1500 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
1501 const char *type, int level)
1503 int i, j;
1504 isl_dim *dim;
1505 isl_map *proj;
1506 isl_set *par;
1507 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
1508 int sync = 0;
1509 isl_union_map *sched;
1511 shared_domain = isl_set_copy(shared_domain);
1512 sched = isl_union_map_copy(gen->tiled_sched);
1513 dim = isl_union_map_get_dim(sched);
1514 proj = projection(dim, gen->tiled_len, shared_len);
1515 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
1516 sched = isl_union_map_intersect_range(sched,
1517 isl_union_set_from_set(isl_set_copy(shared_domain)));
1518 if (shared_len != gen->shared_len) {
1519 dim = isl_union_map_get_dim(sched);
1520 proj = projection(dim, gen->shared_len, shared_len);
1521 proj = isl_map_reverse(proj);
1522 shared_domain = isl_set_apply(shared_domain,
1523 isl_map_copy(proj));
1524 sched = isl_union_map_apply_range(sched,
1525 isl_union_map_from_map(proj));
1528 dim = isl_union_map_get_dim(sched);
1529 par = parametrization(dim, gen->shared_len, 0, gen->shared_len, "g");
1530 sched = isl_union_map_intersect_range(sched,
1531 isl_union_set_from_set(par));
1533 for (i = 0; i < gen->n_array; ++i) {
1534 struct cuda_array_info *array = &gen->array[i];
1536 if (gen->array[i].print_shared_level != level)
1537 continue;
1539 for (j = 0; j < array->n_group; ++j) {
1540 if (print_group_shared_accesses(gen, array->groups[j],
1541 type, shared_domain, sched))
1542 sync = 1;
1546 isl_union_map_free(sched);
1547 isl_set_free(shared_domain);
1549 if (sync) {
1550 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
1551 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
1555 /* Given an index expression into a tile of an array, adjust the expression
1556 * to reflect a shift of the tile to the origin
1557 * (based on the lower bounds in array->shared_bound).
1558 * If the index is strided, then we first add
1559 * bound->shift and divide by bound->stride.
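/* Continuing the illustrative stride example from above: with shift = 3,
 * stride = 4 and lower bound lb, an index i is rewritten to (i + 3)/4 - lb.
 */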
1561 static __isl_give isl_qpolynomial *shift_index(__isl_take isl_qpolynomial *qp,
1562 struct cuda_array_info *array,
1563 struct cuda_array_bound *bound, __isl_take isl_set *domain)
1565 isl_qpolynomial *lb;
1567 if (bound->shift) {
1568 isl_qpolynomial *shift, *t;
1569 isl_int one;
1570 isl_dim *dim;
1571 shift = bound->shift;
1572 shift = isl_qpolynomial_copy(shift);
1573 shift = isl_qpolynomial_drop_dims(shift, isl_dim_set, 0,
1574 isl_qpolynomial_dim(shift, isl_dim_set));
1575 shift = isl_qpolynomial_align_params(shift,
1576 isl_qpolynomial_get_dim(qp));
1577 qp = isl_qpolynomial_add(qp, shift);
1578 dim = isl_qpolynomial_get_dim(qp);
1579 isl_int_init(one);
1580 isl_int_set_si(one, 1);
1581 t = isl_qpolynomial_rat_cst(dim, one, bound->stride);
1582 isl_int_clear(one);
1583 qp = isl_qpolynomial_mul(qp, t);
1586 lb = isl_qpolynomial_from_aff(isl_aff_copy(bound->lb));
1587 lb = isl_qpolynomial_drop_dims(lb, isl_dim_set, 0,
1588 isl_qpolynomial_dim(lb, isl_dim_set));
1590 lb = isl_qpolynomial_align_params(lb, isl_qpolynomial_get_dim(qp));
1592 qp = isl_qpolynomial_sub(qp, lb);
1593 qp = isl_qpolynomial_gist(qp, domain);
1595 return qp;
1598 /* This function is called for each access to an array in some statement
1599 * in the original code.
1600 * Replace that access by an access to shared or (linearized) global memory.
1601 * Since the array in shared memory is just
1602 * a shifted copy of part of the original array, we simply need
1603 * to subtract the lower bound, which was computed
1604 * in can_tile_for_shared_memory.
1605 * If any of the indices is strided, then we first add
1606 * shared_bound[i].shift and divide by shared_bound[i].stride.
1608 * If the given array is accessed directly from global memory,
1609 * we don't need to perform any shifting and simply simplify
1610 * the expression in the context of the domain instead.
1612 * If the array space (range of access) has no name, then we are
1613 * accessing an iterator in the original program.
1615 static void print_access(struct cuda_gen *gen, __isl_take isl_map *access,
1616 int group_nr)
1618 int i;
1619 const char *name;
1620 unsigned n_index;
1621 struct cuda_array_info *array = NULL;
1622 isl_printer *prn;
1623 isl_basic_set *aff;
1624 isl_set *data_set;
1625 isl_set *domain;
1626 struct cuda_array_bound *bounds = NULL;
1628 access = isl_map_align_params(access,
1629 isl_set_get_dim(gen->stmt_domain));
1631 data_set = isl_set_apply(isl_set_copy(gen->stmt_domain), access);
1633 name = isl_set_get_tuple_name(data_set);
1635 if (!name)
1636 fprintf(gen->cuda.kernel_c, "(");
1637 else {
1638 struct cuda_array_ref_group *group;
1640 for (i = 0; i < gen->n_array; ++i) {
1641 if (strcmp(name, gen->array[i].name))
1642 continue;
1643 array = &gen->array[i];
1645 assert(array);
1646 group = array->groups[group_nr];
1647 bounds = group->private_bound;
1648 if (!bounds)
1649 bounds = group->shared_bound;
1651 print_array_name(gen->cuda.kernel_c, group);
1652 fprintf(gen->cuda.kernel_c, "[");
1656 n_index = isl_set_dim(data_set, isl_dim_set);
1657 aff = isl_set_affine_hull(data_set);
1659 prn = isl_printer_to_file(gen->ctx, gen->cuda.kernel_c);
1660 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1662 if (!bounds)
1663 for (i = 0; i + 1 < n_index; ++i)
1664 prn = isl_printer_print_str(prn, "(");
1666 for (i = 0; i < n_index; ++i) {
1667 isl_constraint *c;
1668 isl_qpolynomial *qp;
1669 int ok;
1671 ok = isl_basic_set_has_defining_equality(aff,
1672 isl_dim_out, i, &c);
1673 assert(ok);
1674 qp = isl_qpolynomial_from_constraint(c, isl_dim_out, i);
1675 qp = isl_qpolynomial_drop_dims(qp, isl_dim_set, 0,
1676 isl_qpolynomial_dim(qp, isl_dim_set));
1678 if (!array) {
1679 prn = isl_printer_print_qpolynomial(prn, qp);
1680 isl_qpolynomial_free(qp);
1681 continue;
1684 domain = isl_set_copy(gen->stmt_domain);
1685 domain = isl_set_project_out(domain, isl_dim_set, 0,
1686 isl_set_dim(domain, isl_dim_set));
1687 if (!bounds)
1688 qp = isl_qpolynomial_gist(qp, domain);
1689 else
1690 qp = shift_index(qp, array, &bounds[i], domain);
1692 if (i) {
1693 if (!bounds) {
1694 prn = isl_printer_print_str(prn, ") * (");
1695 prn = isl_printer_print_pw_aff(prn,
1696 array->local_bound[i]);
1697 prn = isl_printer_print_str(prn, ") + ");
1698 } else
1699 prn = isl_printer_print_str(prn, "][");
1701 prn = isl_printer_print_qpolynomial(prn, qp);
1702 isl_qpolynomial_free(qp);
1704 if (!name)
1705 prn = isl_printer_print_str(prn, ")");
1706 else
1707 prn = isl_printer_print_str(prn, "]");
1708 isl_printer_free(prn);
1710 isl_basic_set_free(aff);
1713 static struct cuda_stmt_access *print_expr(struct cuda_gen *gen, FILE *out,
1714 struct pet_expr *expr, struct cuda_stmt_access *access, int outer)
1716 int i;
1718 switch (expr->type) {
1719 case pet_expr_double:
1720 fprintf(out, "%g", expr->d);
1721 break;
1722 case pet_expr_access:
1723 print_access(gen, isl_map_copy(access->access), access->group);
1724 access = access->next;
1725 break;
1726 case pet_expr_unary:
1727 if (!outer)
1728 fprintf(out, "(");
1729 fprintf(out, " %s ", pet_op_str(expr->op));
1730 access = print_expr(gen, out, expr->args[pet_un_arg],
1731 access, 0);
1732 if (!outer)
1733 fprintf(out, ")");
1734 break;
1735 case pet_expr_binary:
1736 if (!outer)
1737 fprintf(out, "(");
1738 access = print_expr(gen, out, expr->args[pet_bin_lhs],
1739 access, 0);
1740 fprintf(out, " %s ", pet_op_str(expr->op));
1741 access = print_expr(gen, out, expr->args[pet_bin_rhs],
1742 access, 0);
1743 if (!outer)
1744 fprintf(out, ")");
1745 break;
1746 case pet_expr_ternary:
1747 if (!outer)
1748 fprintf(out, "(");
1749 access = print_expr(gen, out, expr->args[pet_ter_cond],
1750 access, 0);
1751 fprintf(out, " ? ");
1752 access = print_expr(gen, out, expr->args[pet_ter_true],
1753 access, 0);
1754 fprintf(out, " : ");
1755 access = print_expr(gen, out, expr->args[pet_ter_false],
1756 access, 0);
1757 if (!outer)
1758 fprintf(out, ")");
1759 break;
1760 case pet_expr_call:
1761 fprintf(out, "%s(", expr->name);
1762 for (i = 0; i < expr->n_arg; ++i) {
1763 if (i)
1764 fprintf(out, ", ");
1765 access = print_expr(gen, out, expr->args[i],
1766 access, 1);
1768 fprintf(out, ")");
1770 return access;
1773 static void print_stmt_body(struct cuda_gen *gen,
1774 FILE *out, struct cuda_stmt *stmt)
1776 print_expr(gen, out, stmt->body, stmt->accesses, 1);
1777 fprintf(out, ";\n");
1780 /* This function is called for each leaf in the innermost clast,
781 * i.e., for each statement.
1782 * We print the statement body, simplifying the accesses based
1783 * on the schedule.
1785 static void print_statement(struct gpucode_info *code,
1786 struct clast_user_stmt *u)
1788 struct cuda_gen *gen = code->user;
1789 isl_dim *dim;
1790 isl_set *par;
1791 isl_set *stmt_domain;
1792 isl_union_map *stmt_sched;
1793 isl_union_set *uset;
1794 int nr;
1795 struct cuda_stmt *stmt;
1797 nr = atoi(u->statement->name + 2);
1798 stmt = &gen->stmts[nr];
1800 stmt_domain = extract_host_domain(u);
1802 stmt_sched = isl_union_map_intersect_range(
1803 isl_union_map_copy(gen->local_sched),
1804 isl_union_set_from_set(extend(stmt_domain,
1805 gen->thread_tiled_len)));
1806 dim = isl_union_map_get_dim(stmt_sched);
1807 par = parametrization(dim, gen->thread_tiled_len, 0,
1808 gen->thread_tiled_len, "c");
1809 stmt_sched = isl_union_map_intersect_range(stmt_sched,
1810 isl_union_set_from_set(par));
1812 uset = isl_union_map_domain(stmt_sched);
1813 dim = isl_union_set_get_dim(uset);
1814 dim = isl_dim_add(dim, isl_dim_set,
1815 isl_set_dim(stmt->domain, isl_dim_set));
1816 dim = isl_dim_set_tuple_name(dim, isl_dim_set, u->statement->name);
1817 gen->stmt_domain = isl_union_set_extract_set(uset, dim);
1818 isl_union_set_free(uset);
1820 print_indent(code->dst, code->indent);
1821 print_stmt_body(gen, code->dst, stmt);
1823 isl_set_free(gen->stmt_domain);
1826 /* Print an access to the element in the global memory copy of the
1827 * given array that corresponds to element [qp[0]][qp[1]]...
1828 * of the original array.
1829 * The copy in global memory has been linearized, so we need to take
1830 * the array size into account.
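/* For a two-dimensional array "A" with inner size N1 (illustrative names),
 * the element [i0][i1] is thus printed as
 *
 *	A[(i0) * (N1) + i1]
 */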
1832 static void print_private_global_index(isl_ctx *ctx, FILE *out,
1833 struct cuda_array_info *array, __isl_keep isl_qpolynomial **qp)
1835 int i;
1836 isl_printer *prn;
1838 fprintf(out, "%s[", array->name);
1839 prn = isl_printer_to_file(ctx, out);
1840 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1841 for (i = 0; i + 1 < array->n_index; ++i)
1842 prn = isl_printer_print_str(prn, "(");
1843 for (i = 0; i < array->n_index; ++i) {
1844 if (i) {
1845 prn = isl_printer_print_str(prn, ") * (");
1846 prn = isl_printer_print_pw_aff(prn,
1847 array->local_bound[i]);
1848 prn = isl_printer_print_str(prn, ") + ");
1850 prn = isl_printer_print_qpolynomial(prn, qp[i]);
1852 isl_printer_free(prn);
1853 fprintf(out, "]");
1856 /* Print an access to the element in the shared memory copy of the
1857 * given array reference group that corresponds to element [qps[0]][qps[1]]...
1858 * of the original array.
1859 * Since the array in shared memory is just a shifted copy of part
1860 * of the original array, we simply need to subtract the lower bound,
1861 * which was computed in can_tile_for_shared_memory.
1862 * If any of the indices is strided, then we first add
1863 * shared_bound[i].shift and divide by shared_bound[i].stride.
1865 static void print_private_local_index(isl_ctx *ctx, FILE *out,
1866 struct cuda_array_ref_group *group,
1867 __isl_keep isl_qpolynomial **qps, __isl_keep isl_set *domain)
1869 int i;
1870 isl_printer *prn;
1871 struct cuda_array_info *array = group->array;
1872 struct cuda_array_bound *bounds = group->private_bound;
1874 print_array_name(out, group);
1875 for (i = 0; i < array->n_index; ++i) {
1876 isl_qpolynomial *qp = isl_qpolynomial_copy(qps[i]);
1878 qp = shift_index(qp, array, &bounds[i], isl_set_copy(domain));
1880 fprintf(out, "[");
1881 prn = isl_printer_to_file(ctx, out);
1882 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1883 prn = isl_printer_print_qpolynomial(prn, qp);
1884 isl_printer_free(prn);
1885 fprintf(out, "]");
1886 isl_qpolynomial_free(qp);
1890 /* This function is called for each leaf in the clast of the code
1891 * for copying to or from private memory.
1892 * The statement name is read_private_<array> or write_private_<array>.
1894 * The schedule iterates over the array elements, so we can use
1895 * the domain of private_sched at the current scheduling position
1896 * as the index of the array.
1898 static void print_private_copy_statement(struct gpucode_info *code,
1899 struct clast_user_stmt *u)
1901 struct cuda_gen *gen = code->user;
1902 isl_set *domain;
1903 isl_map *sched;
1904 struct cuda_array_ref_group *group = gen->private_group;
1905 int i;
1906 unsigned n_in;
1907 unsigned n_out;
1908 isl_dim *dim;
1909 isl_set *param;
1910 isl_set *index;
1911 isl_basic_set *aff;
1912 isl_ctx *ctx;
1913 isl_qpolynomial **qp;
1914 int read;
1916 read = !strncmp(u->statement->name, "read", 4);
1918 domain = extract_host_domain(u);
1919 assert(domain);
1921 sched = isl_map_copy(gen->private_sched);
1922 sched = isl_map_reverse(sched);
1923 sched = isl_map_intersect_domain(sched, domain);
1924 n_in = isl_map_dim(sched, isl_dim_in);
1925 n_out = isl_map_dim(sched, isl_dim_out);
1926 dim = isl_map_get_dim(sched);
1927 dim = isl_dim_drop(dim, isl_dim_in, 0, n_in);
1928 dim = isl_dim_drop(dim, isl_dim_out, 0, n_out);
1929 param = parametrization(dim, n_in, 0, n_in, "c");
1930 sched = isl_map_align_params(sched, isl_set_get_dim(param));
1931 sched = isl_map_intersect_domain(sched, param);
1932 index = isl_map_range(sched);
1933 domain = isl_set_copy(index);
1934 aff = isl_set_affine_hull(index);
1935 domain = isl_set_project_out(domain, isl_dim_set, 0, n_out);
1937 ctx = isl_basic_set_get_ctx(aff);
1938 qp = isl_alloc_array(ctx, isl_qpolynomial *, n_out);
1939 assert(qp);
1941 for (i = 0; i < n_out; ++i) {
1942 isl_constraint *c;
1943 int ok;
1945 ok = isl_basic_set_has_defining_equality(aff,
1946 isl_dim_set, i, &c);
1947 assert(ok);
1948 qp[i] = isl_qpolynomial_from_constraint(c, isl_dim_set, i);
1949 qp[i] = isl_qpolynomial_drop_dims(qp[i], isl_dim_set, 0, n_out);
1952 print_indent(code->dst, code->indent);
1953 if (read) {
1954 print_private_local_index(ctx, code->dst, group, qp, domain);
1955 fprintf(code->dst, " = ");
1956 print_private_global_index(ctx, code->dst, group->array, qp);
1957 } else {
1958 print_private_global_index(ctx, code->dst, group->array, qp);
1959 fprintf(code->dst, " = ");
1960 print_private_local_index(ctx, code->dst, group, qp, domain);
1962 fprintf(code->dst, ";\n");
1964 for (i = 0; i < n_out; ++i)
1965 isl_qpolynomial_free(qp[i]);
1966 free(qp);
1968 isl_basic_set_free(aff);
1969 isl_set_free(domain);
1972 static void print_private_access(struct cuda_gen *gen,
1973 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
1974 const char *type, struct cuda_array_ref_group *group)
1976 const char *array_name;
1977 char *name;
1978 isl_ctx *ctx;
1979 unsigned nvar = isl_set_dim(access, isl_dim_set);
1980 isl_union_map *usched;
1982 if (isl_set_fast_is_empty(access)) {
1983 isl_set_free(access);
1984 return;
1987 ctx = isl_set_get_ctx(access);
1988 array_name = isl_set_get_tuple_name(access);
1989 name = isl_alloc_array(ctx, char,
1990 strlen(type) + sizeof("_private_") + strlen(array_name) + 20);
1991 if (group->array->n_group > 1)
1992 sprintf(name, "%s_private_%s_%d", type, array_name, group->nr);
1993 else
1994 sprintf(name, "%s_private_%s", type, array_name);
1995 access = isl_set_set_tuple_name(access, name);
1996 free(name);
1998 gen->private_sched = shift_access(access, group);
1999 gen->private_group = group;
2001 usched = isl_union_map_from_map(isl_map_copy(gen->private_sched));
2002 print_shared_body(gen, shared_domain, usched, nvar,
2003 &print_private_copy_statement, 1);
2004 isl_union_map_free(usched);
2006 isl_map_free(gen->private_sched);
2009 /* Print code for reading into or writing from private memory
2010 * the given array reference group.
2012 * sched maps the original iteration domains to the shared memory tile loops.
2014 static void print_group_private_accesses(struct cuda_gen *gen,
2015 struct cuda_array_ref_group *group,
2016 const char *type, __isl_keep isl_set *shared_domain,
2017 unsigned first_shared, int shared_len, __isl_keep isl_union_map *sched)
2019 int read;
2020 isl_union_map *access;
2021 isl_union_set *uset;
2022 isl_set *access_set;
2024 if (!group->private_bound)
2025 return;
2027 read = !strcmp(type, "read");
2029 access = group_access_relation(group, read, !read);
2030 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
2031 access = isl_union_map_intersect(access,
2032 isl_union_map_copy(gen->private_access));
2033 uset = isl_union_map_range(access);
2035 if (isl_union_set_is_empty(uset)) {
2036 isl_union_set_free(uset);
2037 return;
2040 access_set = isl_set_from_union_set(uset);
2041 access_set = isl_set_coalesce(access_set);
2042 access_set = isl_set_eliminate(access_set, isl_dim_param,
2043 first_shared + shared_len,
2044 gen->shared_len - shared_len);
2046 print_private_access(gen, shared_domain, access_set, type, group);
2049 /* Print code for reading into or writing from private memory at
2050 * the given level (-1 for innermost).
2052 * If we are not printing at the innermost level, then the dimensionality
2053 * of shared_domain may be smaller than gen->shared_len.
2054 * As the rest of the code assumes that the domain of access has
2055 * gen->shared_len dimensions, we may therefore need to embed this domain
2056 * in a higher dimensional space after intersection with shared_domain.
2058 * This code is very similar to print_shared_accesses.
2059 * The main difference is that we need to take into account gen->private_access.
2061 static void print_private_accesses(struct cuda_gen *gen,
2062 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
2063 const char *type, int level)
2065 int i, j;
2066 isl_dim *dim;
2067 isl_map *proj;
2068 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
2069 unsigned first_shared;
2070 isl_union_map *sched;
2072 shared_domain = isl_set_copy(shared_domain);
2073 sched = isl_union_map_copy(gen->tiled_sched);
2074 dim = isl_union_map_get_dim(sched);
2075 first_shared = isl_dim_size(dim, isl_dim_param);
2076 proj = projection(dim, gen->tiled_len, shared_len);
2077 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2078 sched = isl_union_map_intersect_range(sched,
2079 isl_union_set_from_set(isl_set_copy(shared_domain)));
2080 if (shared_len != gen->shared_len) {
2081 dim = isl_union_map_get_dim(sched);
2082 proj = projection(dim, gen->shared_len, shared_len);
2083 proj = isl_map_reverse(proj);
2084 shared_domain = isl_set_apply(shared_domain,
2085 isl_map_copy(proj));
2086 sched = isl_union_map_apply_range(sched,
2087 isl_union_map_from_map(proj));
2090 for (i = 0; i < gen->n_array; ++i) {
2091 struct cuda_array_info *array = &gen->array[i];
2093 if (gen->array[i].print_shared_level != level)
2094 continue;
2096 for (j = 0; j < array->n_group; ++j)
2097 print_group_private_accesses(gen, array->groups[j],
2098 type, shared_domain,
2099 first_shared, shared_len, sched);
2102 isl_union_map_free(sched);
2103 isl_set_free(shared_domain);
2106 /* Set unroll[j] if the input dimension j is involved in
2107 * the index expression represented by bmap.
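 *
 * For example (hypothetical access), for an index expression
 * { S[i,j] -> A[i + 1] } only input dimension 0 is involved,
 * so unroll[0] is set and unroll[1] is left untouched.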
2109 static int check_unroll(__isl_take isl_basic_map *bmap, void *user)
2111 int i, j;
2112 int n_in = isl_basic_map_dim(bmap, isl_dim_in);
2113 int n_out = isl_basic_map_dim(bmap, isl_dim_out);
2114 int *unroll = user;
2116 for (i = 0; i < n_out; ++i) {
2117 isl_constraint *c;
2118 int ok;
2120 ok = isl_basic_map_has_defining_equality(bmap,
2121 isl_dim_out, i, &c);
2122 assert(ok);
2123 for (j = 0; j < n_in; ++j)
2124 if (isl_constraint_involves_dims(c, isl_dim_in, j, 1))
2125 unroll[j] = 1;
2126 isl_constraint_free(c);
2129 isl_basic_map_free(bmap);
2130 return 0;
2133 /* Given an array pos mapping input dimensions to the corresponding
2134 * output dimension, construct the corresponding map.
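 *
 * For example (illustrative), for len = 3 and pos = { 2, 0, 1 },
 * the constructed map is
 *
 *	{ [i0, i1, i2] -> [i1, i2, i0] }
 *
 * since output dimension pos[i] is equated to input dimension i.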
2136 static __isl_give isl_map *permutation(__isl_take isl_dim *dim,
2137 int *pos, int len)
2139 int i;
2140 isl_constraint *c;
2141 isl_basic_map *bmap;
2143 dim = isl_dim_add(dim, isl_dim_in, len);
2144 dim = isl_dim_add(dim, isl_dim_out, len);
2145 bmap = isl_basic_map_universe(isl_dim_copy(dim));
2147 for (i = 0; i < len; ++i) {
2148 c = isl_equality_alloc(isl_dim_copy(dim));
2149 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
2150 isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i], 1);
2151 bmap = isl_basic_map_add_constraint(bmap, c);
2153 isl_dim_free(dim);
2155 return isl_map_from_basic_map(bmap);
2158 /* Find all loops involved in any of the index expressions for any of
2159 * the private accesses, move them innermost and then mark them as
2160 * requiring unrolling by setting gen->first_unroll.
2161 * The loops involved should all be parallel because of the checks
2162 * we performed in check_private_group_access. Moving them innermost
2163 * is therefore a valid transformation.
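 *
 * For example (illustrative), if the schedule dimensions are [c0, c1, c2, c3]
 * and only c1 appears in a private index expression, the dimensions are
 * reordered to [c0, c2, c3, c1], with the unrolled loop c1 moved innermost.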
2165 static __isl_give isl_union_map *interchange_for_unroll(struct cuda_gen *gen,
2166 __isl_take isl_union_map *sched)
2168 int i, j;
2169 int unroll[gen->thread_tiled_len];
2170 int perm[gen->thread_tiled_len];
2171 isl_dim *dim;
2172 isl_map *permute;
2173 int len = gen->shared_len + gen->n_parallel + gen->n_block;
2175 gen->first_unroll = -1;
2177 for (i = 0; i < gen->thread_tiled_len; ++i)
2178 unroll[i] = 0;
2179 for (i = 0; i < gen->n_array; ++i) {
2180 struct cuda_array_info *array = &gen->array[i];
2182 for (j = 0; j < array->n_group; ++j) {
2183 isl_union_map *access;
2184 isl_map *acc;
2186 if (!array->groups[j]->private_bound)
2187 continue;
2189 access = group_access_relation(array->groups[j], 1, 1);
2190 access = isl_union_map_apply_domain(access,
2191 isl_union_map_copy(sched));
2193 acc = isl_map_from_union_map(access);
2194 isl_map_foreach_basic_map(acc, &check_unroll, unroll);
2196 isl_map_free(acc);
2200 for (i = 0; i < gen->shared_len; ++i)
2201 if (unroll[i])
2202 return sched;
2204 for (i = gen->shared_len; i < len; ++i)
2205 if (unroll[i])
2206 break;
2208 if (i >= len)
2209 return sched;
2211 for (i = len; i < gen->thread_tiled_len; ++i)
2212 if (unroll[i])
2213 return sched;
2215 j = 0;
2216 for (i = 0; i < gen->thread_tiled_len; ++i)
2217 if (!unroll[i])
2218 perm[i] = j++;
2219 gen->first_unroll = 1 + j;
2220 for (i = 0; i < len; ++i)
2221 if (unroll[i])
2222 perm[i] = j++;
2224 dim = isl_union_map_get_dim(sched);
2225 permute = permutation(dim, perm, gen->thread_tiled_len);
2226 sched = isl_union_map_apply_range(sched,
2227 isl_union_map_from_map(permute));
2229 return sched;
2232 /* This function is called for each leaf in the clast of the kernel code.
2233 * We first specialize the schedule to the site of the leaf and
2234 * print code for reading into shared memory, performing the actual
2235 * computations and writing from shared memory, with the required
2236 * synchronizations.
2238 static void print_kernel_user(struct gpucode_info *code,
2239 struct clast_user_stmt *u)
2241 struct cuda_gen *gen = code->user;
2242 isl_set *shared_domain;
2244 shared_domain = extract_entire_host_domain(u);
2246 print_shared_accesses(gen, shared_domain, gen->read, "read", -1);
2248 print_private_accesses(gen, shared_domain, gen->read, "read", -1);
2250 print_shared_body(gen, shared_domain, gen->local_sched,
2251 gen->thread_tiled_len, &print_statement,
2252 gen->first_unroll);
2254 print_private_accesses(gen, shared_domain, gen->write, "write", -1);
2256 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
2257 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
2259 print_shared_accesses(gen, shared_domain, gen->write, "write", -1);
2261 isl_set_free(shared_domain);
2264 /* Check if we need to perform any copying to shared memory at this level
2265 * and if so, print the copying instructions.
2266 * Copying instructions are printed for any array for which we are allowed
2267 * to print them at this level but have not done so already.
2269 static void print_kernel_for_head(struct gpucode_info *code,
2270 struct clast_for *f)
2272 int i;
2273 struct cuda_gen *gen = code->user;
2274 isl_set *domain;
2275 int level;
2276 int print = 0;
2278 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2279 level = isl_set_dim(domain, isl_dim_set) - 1;
2281 for (i = 0; i < gen->n_array; ++i) {
2282 if (gen->array[i].print_shared_level >= 0)
2283 continue;
2284 if (gen->array[i].last_shared > level)
2285 continue;
2286 gen->array[i].print_shared_level = level;
2287 print = 1;
2290 if (print) {
2291 print_shared_accesses(gen, domain, gen->read, "read", level);
2292 print_private_accesses(gen, domain, gen->read, "read", level);
2295 isl_set_free(domain);
2298 /* Print instructions for copying from shared memory for each array
2299 * for which print_kernel_for_head has added copying instructions
2300 * to shared memory.
2302 static void print_kernel_for_foot(struct gpucode_info *code,
2303 struct clast_for *f)
2305 int i;
2306 struct cuda_gen *gen = code->user;
2307 isl_set *domain;
2308 int level;
2309 int print = 0;
2311 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2312 level = isl_set_dim(domain, isl_dim_set) - 1;
2314 for (i = 0; i < gen->n_array; ++i) {
2315 if (gen->array[i].print_shared_level != level)
2316 continue;
2317 print = 1;
2318 break;
2321 if (print) {
2322 print_private_accesses(gen, domain, gen->write, "write", level);
2323 print_shared_accesses(gen, domain, gen->write, "write", level);
2326 isl_set_free(domain);
2329 /* Use CLooG to generate code for the outer gen->shared_len loops
2330 * of the local schedule "sched".
2331 * The pretty printing of this code is handled by gpu_print_host_stmt,
2332 * which calls print_kernel_user for each iteration of the shared tile loops.
2334 static void print_cloog_kernel_body(struct cuda_gen *gen,
2335 __isl_keep isl_set *context, __isl_keep isl_union_map *sched)
2337 int i;
2338 CloogOptions *options;
2339 CloogDomain *cloog_context;
2340 CloogUnionDomain *ud;
2341 CloogInput *input;
2342 struct clast_stmt *stmt;
2343 char name[20];
2345 sched = isl_union_map_copy(sched);
2346 sched = isl_union_map_align_params(sched, isl_set_get_dim(context));
2348 options = cloog_options_malloc(gen->state);
2349 options->language = LANGUAGE_C;
2350 options->strides = 1;
2351 options->sh = 1;
2352 options->stop = gen->shared_len;
2353 options->f = gen->tiled_len;
2354 options->l = gen->tiled_len;
2355 options->save_domains = 1;
2356 options->noscalars = 1;
2358 ud = cloog_union_domain_from_isl_union_map(sched);
2359 for (i = 0; i < gen->shared_len; ++i) {
2360 snprintf(name, sizeof(name), "g%d", i);
2361 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
2363 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
2364 input = cloog_input_alloc(cloog_context, ud);
2366 stmt = cloog_clast_create_from_input(input, options);
2368 gen->kernel_code.indent = 4;
2369 gen->kernel_code.dst = gen->cuda.kernel_c;
2370 gen->kernel_code.print_user_stmt = NULL;
2371 gen->kernel_code.print_user_stmt_list = &print_kernel_user;
2372 gen->kernel_code.print_for_head = &print_kernel_for_head;
2373 gen->kernel_code.print_for_foot = &print_kernel_for_foot;
2374 gen->kernel_code.user = gen;
2375 gpu_print_host_stmt(&gen->kernel_code, stmt);
2377 cloog_clast_free(stmt);
2378 cloog_options_free(options);
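/* Declare the block and thread identifiers used inside the kernel and
 * initialize them from the CUDA built-in variables.
 * For example, for gen->n_grid == 2 and gen->n_block == 3, this prints
 *
 *	int b0 = blockIdx.y, b1 = blockIdx.x;
 *	int t0 = threadIdx.z, t1 = threadIdx.y, t2 = threadIdx.x;
 */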
2381 static void print_kernel_iterators(struct cuda_gen *gen)
2383 int i;
2384 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
2385 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
2386 "threadIdx.z" };
2388 if (gen->n_grid > 0) {
2389 print_indent(gen->cuda.kernel_c, 4);
2390 fprintf(gen->cuda.kernel_c, "int ");
2391 for (i = 0; i < gen->n_grid; ++i) {
2392 if (i)
2393 fprintf(gen->cuda.kernel_c, ", ");
2394 fprintf(gen->cuda.kernel_c, "b%d = %s",
2395 i, block_dims[gen->n_grid - 1 - i]);
2397 fprintf(gen->cuda.kernel_c, ";\n");
2400 if (gen->n_block > 0) {
2401 print_indent(gen->cuda.kernel_c, 4);
2402 fprintf(gen->cuda.kernel_c, "int ");
2403 for (i = 0; i < gen->n_block; ++i) {
2404 if (i)
2405 fprintf(gen->cuda.kernel_c, ", ");
2406 fprintf(gen->cuda.kernel_c, "t%d = %s",
2407 i, thread_dims[gen->n_block - 1 - i]);
2409 fprintf(gen->cuda.kernel_c, ";\n");
2413 static void print_group_shared_array(struct cuda_gen *gen,
2414 struct cuda_array_ref_group *group)
2416 int j;
2417 struct cuda_array_bound *bounds;
2419 bounds = group->private_bound;
2420 if (!bounds)
2421 bounds = group->shared_bound;
2422 if (!bounds)
2423 return;
2425 print_indent(gen->cuda.kernel_c, 4);
2426 fprintf(gen->cuda.kernel_c, "%s%s ",
2427 group->private_bound ? "" : "__shared__ ", group->array->type);
2428 print_array_name(gen->cuda.kernel_c, group);
2429 for (j = 0; j < group->array->n_index; ++j) {
2430 fprintf(gen->cuda.kernel_c, "[");
2431 isl_int_print(gen->cuda.kernel_c, bounds[j].size, 0);
2432 fprintf(gen->cuda.kernel_c, "]");
2434 fprintf(gen->cuda.kernel_c, ";\n");
2437 static void print_shared_arrays(struct cuda_gen *gen)
2439 int i, j;
2441 for (i = 0; i < gen->n_array; ++i) {
2442 struct cuda_array_info *array = &gen->array[i];
2444 for (j = 0; j < array->n_group; ++j)
2445 print_group_shared_array(gen, array->groups[j]);
2449 static void print_kernel_body(struct cuda_gen *gen,
2450 __isl_keep isl_set *host_domain, __isl_keep isl_union_map *sched)
2452 isl_set *context;
2454 context = isl_set_copy(host_domain);
2455 context = parametrize(context, 0, gen->tile_first, "h");
2456 context = isl_set_project_out(context, isl_dim_set, 0, gen->tile_first);
2457 context = add_bounded_parameters(context,
2458 gen->n_grid, gen->grid_dim, "b");
2460 print_kernel_iterators(gen);
2461 print_shared_arrays(gen);
2463 fprintf(gen->cuda.kernel_c, "\n");
2465 print_cloog_kernel_body(gen, context, sched);
2467 isl_set_free(context);
2470 /* Given a constraint
2472 * a(p,i) + j = g f(e)
2474 * or -a(p,i) - j = g f(e) if sign < 0,
2475 * store a(p,i) in bound->shift and g (stride) in bound->stride.
2476 * a(p,i) is assumed to be an expression in only the parameters.
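 *
 * For example (with a hypothetical parameter n), the constraint
 *
 *	n + j = 4 e0
 *
 * results in bound->shift = n and bound->stride = 4, expressing that
 * j + n is always a multiple of 4.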
2478 static void extract_stride(__isl_keep isl_constraint *c,
2479 struct cuda_array_bound *bound, isl_int stride, int sign)
2481 int i;
2482 isl_int v;
2483 isl_int one;
2484 isl_dim *dim;
2485 unsigned nparam;
2486 isl_qpolynomial *qp;
2488 isl_int_set(bound->stride, stride);
2490 dim = isl_constraint_get_dim(c);
2491 dim = isl_dim_drop(dim, isl_dim_out, 0, 1);
2492 dim = isl_dim_drop(dim, isl_dim_in, 0, isl_dim_size(dim, isl_dim_in));
2493 dim = isl_dim_domain(dim);
2495 nparam = isl_dim_size(dim, isl_dim_param);
2497 isl_int_init(v);
2498 isl_int_init(one);
2499 isl_int_set_si(one, 1);
2501 isl_constraint_get_constant(c, &v);
2502 if (sign < 0)
2503 isl_int_neg(v, v);
2504 qp = isl_qpolynomial_rat_cst(isl_dim_copy(dim), v, one);
2506 for (i = 0; i < nparam; ++i) {
2507 isl_qpolynomial *t, *p;
2509 isl_constraint_get_coefficient(c, isl_dim_param, i, &v);
2510 if (isl_int_is_zero(v))
2511 continue;
2512 if (sign < 0)
2513 isl_int_neg(v, v);
2514 t = isl_qpolynomial_rat_cst(isl_dim_copy(dim), v, one);
2515 p = isl_qpolynomial_var(isl_dim_copy(dim), isl_dim_param, i);
2516 t = isl_qpolynomial_mul(t, p);
2517 qp = isl_qpolynomial_add(qp, t);
2520 isl_dim_free(dim);
2521 isl_int_clear(one);
2522 isl_int_clear(v);
2524 bound->shift = qp;
2527 /* Given an equality constraint of a map with a single output dimension j,
2528 * check if the constraint is of the form
2530 * a(p,i) + j = g f(e)
2532 * with a(p,i) an expression in the parameters and input dimensions
2533 * and f(e) an expression in the existentially quantified variables.
2534 * If so, and if g is larger than any such g from a previously considered
2535 * constraint, then call extract_stride to record the stride information
2536 * in bound.
2538 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
2540 int i;
2541 isl_int v, stride;
2542 unsigned n_div;
2543 struct cuda_array_bound *bound = user;
2545 isl_int_init(v);
2546 isl_int_init(stride);
2548 n_div = isl_constraint_dim(c, isl_dim_div);
2549 isl_constraint_get_coefficient(c, isl_dim_out, 0, &v);
2551 if (n_div && (isl_int_is_one(v) || isl_int_is_negone(v))) {
2552 int s = isl_int_sgn(v);
2553 isl_int_set_si(stride, 0);
2554 for (i = 0; i < n_div; ++i) {
2555 isl_constraint_get_coefficient(c, isl_dim_div, i, &v);
2556 isl_int_gcd(stride, stride, v);
2558 if (!isl_int_is_zero(stride) &&
2559 isl_int_gt(stride, bound->stride))
2560 extract_stride(c, bound, stride, s);
2563 isl_int_clear(stride);
2564 isl_int_clear(v);
2566 isl_constraint_free(c);
2567 return 0;
2570 /* Given constraints on an array index i, check if we can find
2571 * a shift a(p) and a stride g such that
2573 * a(p) + i = 0 mod g
2575 * If so, record the information in bound and apply the mapping
2576 * i -> (i + a(p))/g to the array index in bounds and return
2577 * the new constraints.
2578 * If not, simply return the original constraints.
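 *
 * For example (illustrative), if the accessed values of i are all of the
 * form 4 e + 1, then a(p) = -1 and g = 4, and the index is rewritten as
 * i -> (i - 1)/4, mapping the accessed elements onto consecutive values.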
2580 static __isl_give isl_basic_map *check_stride(struct cuda_gen *gen,
2581 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2583 isl_dim *dim;
2584 isl_basic_map *aff;
2585 isl_basic_map *shift;
2586 isl_qpolynomial *qp, *t;
2587 isl_int one;
2589 isl_int_set_si(bound->stride, -1);
2591 aff = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
2593 isl_basic_map_foreach_constraint(aff, &check_stride_constraint, bound);
2595 isl_basic_map_free(aff);
2597 if (isl_int_is_neg(bound->stride))
2598 return bounds;
2600 qp = isl_qpolynomial_copy(bound->shift);
2601 qp = isl_qpolynomial_add_dims(qp, isl_dim_set, 1);
2602 dim = isl_qpolynomial_get_dim(qp);
2603 t = isl_qpolynomial_var(isl_dim_copy(dim), isl_dim_set, 0);
2604 qp = isl_qpolynomial_add(qp, t);
2605 isl_int_init(one);
2606 isl_int_set_si(one, 1);
2607 t = isl_qpolynomial_rat_cst(dim, one, bound->stride);
2608 isl_int_clear(one);
2609 qp = isl_qpolynomial_mul(qp, t);
2610 shift = isl_basic_map_from_qpolynomial(qp);
2612 bound->shift_map = isl_basic_map_copy(shift);
2613 bounds = isl_basic_map_apply_range(bounds, shift);
2615 return bounds;
2618 struct cuda_size_info {
2619 isl_basic_set *bset;
2620 struct cuda_array_bound *bound;
2621 int pos;
2624 /* Given a constraint from the basic set describing the bounds on
2625 * an array index, check if it is a lower bound, say m i >= b(x), and,
2626 * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
2627 * upper bound. If so, and if this bound is smaller than any bound
2628 * derived from earlier constraints, set the size to this bound on
2629 * the expression and the lower bound to ceil(b(x)/m).
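 *
 * For example (hypothetical constraints), if the accesses satisfy
 *
 *	32 b0 <= i <= 32 b0 + 31
 *
 * then the lower bound i >= 32 b0 yields lb = 32 b0, and since
 * i - 32 b0 + 1 has constant upper bound 32, the size is set to 32.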
2631 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
2633 struct cuda_size_info *size = user;
2634 unsigned nparam;
2635 unsigned n_div;
2636 isl_int v;
2638 nparam = isl_basic_set_dim(size->bset, isl_dim_param);
2639 n_div = isl_constraint_dim(c, isl_dim_div);
2641 if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div)) {
2642 isl_constraint_free(c);
2643 return 0;
2646 isl_int_init(v);
2648 isl_constraint_get_coefficient(c, isl_dim_set, size->pos, &v);
2650 if (isl_int_is_pos(v)) {
2651 isl_aff *aff;
2652 isl_aff *lb;
2653 enum isl_lp_result res;
2655 aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
2656 aff = isl_aff_ceil(aff);
2658 lb = isl_aff_copy(aff);
2660 aff = isl_aff_neg(aff);
2661 aff = isl_aff_add_coefficient_si(aff, isl_dim_set, size->pos, 1);
2663 res = isl_basic_set_max(size->bset, aff, &v);
2664 isl_aff_free(aff);
2666 if (res == isl_lp_ok) {
2667 isl_int_add_ui(v, v, 1);
2668 if (isl_int_is_neg(size->bound->size) ||
2669 isl_int_lt(v, size->bound->size)) {
2670 isl_int_set(size->bound->size, v);
2671 lb = isl_aff_drop_dims(lb, isl_dim_set,
2672 0, size->pos + 1);
2673 isl_aff_free(size->bound->lb);
2674 size->bound->lb = isl_aff_copy(lb);
2677 isl_aff_free(lb);
2680 isl_int_clear(v);
2681 isl_constraint_free(c);
2683 return 0;
2686 /* Given a basic map "bounds" that maps parameters and input dimensions
2687 * to a single output dimension, look for an expression in the parameters
2688 * and input dimensions such that the range of the output dimension shifted
2689 * by this expression is a constant.
2691 * In particular, we currently only consider lower bounds on the output
2692 * dimension as candidate expressions.
2694 static int compute_array_dim_size(struct cuda_gen *gen,
2695 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2697 struct cuda_size_info size;
2699 bounds = check_stride(gen, bound, bounds);
2701 isl_int_set_si(bound->size, -1);
2702 bound->lb = NULL;
2704 size.bound = bound;
2705 size.pos = isl_basic_map_dim(bounds, isl_dim_in);
2706 size.bset = isl_basic_map_wrap(bounds);
2707 size.bset = isl_basic_set_flatten(size.bset);
2708 isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
2709 &size);
2710 isl_basic_set_free(size.bset);
2712 return isl_int_is_nonneg(bound->size) ? 0 : -1;
2715 /* Check if we can find a shared memory tile for the given array
2716 * based on the given accesses, and if so, put the results
2717 * in "bounds".
2719 * We project the accesses on each index in turn and look for a parametric
2720 * offset such that the size is constant.
2722 static int can_tile_for_shared_memory(struct cuda_gen *gen,
2723 struct cuda_array_info *array, __isl_keep isl_map *access,
2724 struct cuda_array_bound *bounds)
2726 int i;
2728 for (i = 0; i < array->n_index; ++i) {
2729 isl_map *access_i;
2730 isl_basic_map *hull;
2732 access_i = isl_map_copy(access);
2733 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
2734 access_i = isl_map_project_out(access_i, isl_dim_out,
2735 1, array->n_index - (i + 1));
2736 access_i = isl_map_compute_divs(access_i);
2737 hull = isl_map_simple_hull(access_i);
2738 if (compute_array_dim_size(gen, &bounds[i], hull) < 0)
2739 return 0;
2742 return 1;
2745 /* Construct a map with as input the shared tile loops and the loops that
2746 * will be wrapped around the threads, relating the latter loops
2747 * to the thread indices and then projecting them out.
2749 static __isl_give isl_map *compute_privatization(struct cuda_gen *gen)
2751 isl_map *priv;
2752 isl_map *tiling;
2753 isl_map *proj;
2754 isl_set *par;
2755 isl_dim *dim;
2757 dim = isl_union_map_get_dim(gen->shared_sched);
2759 if (gen->options->wrap)
2760 tiling = wrap(isl_dim_copy(dim), gen->shared_len + gen->n_block,
2761 gen->shared_len, gen->n_block, gen->block_dim);
2762 else
2763 tiling = tile(isl_dim_copy(dim), gen->shared_len + gen->n_block,
2764 gen->shared_len, gen->n_block, gen->block_dim);
2766 priv = tiling;
2768 par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2769 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2770 gen->n_block, "t");
2772 priv = isl_map_align_params(priv, isl_set_get_dim(par));
2773 priv = isl_map_intersect_range(priv, par);
2775 dim = isl_map_get_dim(priv);
2776 dim = isl_dim_drop(dim, isl_dim_in, 0, isl_dim_size(dim, isl_dim_in));
2777 dim = isl_dim_drop(dim, isl_dim_out, 0, isl_dim_size(dim, isl_dim_out));
2778 proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2779 gen->shared_len);
2781 priv = isl_map_apply_range(priv, proj);
2783 return priv;
2786 /* Construct a map from domain_dim to domain_dim that increments
2787 * the dimension at position "pos" and leaves all other dimensions
2788 * constant.
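 *
 * For example, for a three-dimensional space and pos = 1, the result is
 *
 *	{ [i0, i1, i2] -> [i0, i1 + 1, i2] }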
2790 static __isl_give isl_map *next(__isl_take isl_dim *domain_dim, int pos)
2792 int i;
2793 int len = isl_dim_size(domain_dim, isl_dim_set);
2794 isl_dim *dim;
2795 isl_basic_map *next;
2797 dim = isl_dim_map_from_set(domain_dim);
2798 next = isl_basic_map_universe(isl_dim_copy(dim));
2800 for (i = 0; i < len; ++i) {
2801 isl_constraint *c;
2803 c = isl_equality_alloc(isl_dim_copy(dim));
2804 isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2805 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2806 if (i == pos)
2807 isl_constraint_set_constant_si(c, 1);
2808 next = isl_basic_map_add_constraint(next, c);
2811 isl_dim_free(dim);
2813 return isl_map_from_basic_map(next);
2816 /* Check if the given access is coalesced.
2817 * That is, check whether incrementing the dimension that will get
2818 * wrapped over the last thread index results in incrementing
2819 * the last array index.
2821 * This function is only called for access relations without reuse.
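 *
 * For example (hypothetical accesses), if the iterator that gets wrapped
 * over the last thread index appears as the last array index, as in A[i][j]
 * with j wrapped over the threads, then consecutive threads access
 * consecutive elements and the access is coalesced; for A[j][i] it is not.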
2823 static int access_is_coalesced(struct cuda_gen *gen,
2824 __isl_keep isl_union_map *access)
2826 isl_dim *dim;
2827 isl_map *access_map;
2828 isl_map *next_thread_x;
2829 isl_map *next_element;
2830 isl_map *map;
2831 int coalesced;
2833 access = isl_union_map_copy(access);
2834 access = isl_union_map_apply_domain(access,
2835 isl_union_map_copy(gen->tiled_sched));
2836 access_map = isl_map_from_union_map(access);
2838 dim = isl_map_get_dim(access_map);
2839 dim = isl_dim_domain(dim);
2840 next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
2842 dim = isl_map_get_dim(access_map);
2843 dim = isl_dim_range(dim);
2844 next_element = next(dim, isl_dim_size(dim, isl_dim_set) - 1);
2846 map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
2847 map = isl_map_apply_range(map, access_map);
2849 coalesced = isl_map_is_subset(map, next_element);
2851 isl_map_free(next_element);
2852 isl_map_free(map);
2854 return coalesced;
2857 /* For the given array reference group, check whether the access is private
2858 * to the thread. That is, check that any given array element
2859 * is only accessed by a single thread.
2860 * We compute an access relation that maps the shared tile loop iterators
2861 * and the shared point loop iterators that will be wrapped over the
2862 * threads to the array elements.
2863 * We actually check that those iterators that will be wrapped
2864 * partition the array space. This check is stricter than necessary
2865 * since several iterations may be mapped onto the same thread
2866 * and then they could be allowed to access the same memory elements,
2867 * but our check does not allow this situation.
2869 * We also check that the index expression only depends on parallel
2870 * loops. That way, we can move those loops innermost and unroll them.
2871 * Again, we use a test that is stricter than necessary.
2872 * We actually check whether the index expression only depends
2873 * on the iterators that are wrapped over the threads.
2874 * These are necessarily parallel, but there may be more parallel loops.
2876 * Combining the injectivity of the first test with the single-valuedness
2877 * of the second test, we simply test for bijectivity.
2879 * If it turns out we can use registers, we compute the private memory
2880 * tile size using can_tile_for_shared_memory, after introducing a dependence
2881 * on the thread indices.
2883 * Before performing any of the above computations, we first check
2884 * if there is any reuse on the reference group. If not, we simply
2885 * return. If, moreover, the access is coalesced then we also remove
2886 * the shared memory tiling since we should just use global memory instead.
2888 static void check_private_group_access(struct cuda_gen *gen,
2889 struct cuda_array_ref_group *group)
2891 isl_map *acc;
2892 isl_union_map *access;
2893 int n_index = group->array->n_index;
2895 access = group_access_relation(group, 1, 1);
2896 if (isl_union_map_is_injective(access)) {
2897 if (group->shared_bound && access_is_coalesced(gen, access)) {
2898 free_bound_list(group->shared_bound, n_index);
2899 group->shared_bound = NULL;
2901 isl_union_map_free(access);
2902 return;
2904 access = isl_union_map_apply_domain(access,
2905 isl_union_map_copy(gen->shared_sched));
2907 acc = isl_map_from_union_map(access);
2909 if (!isl_map_is_bijective(acc)) {
2910 isl_map_free(acc);
2911 return;
2914 group->private_bound = create_bound_list(gen->ctx, n_index);
2915 acc = isl_map_align_params(acc, isl_map_get_dim(gen->privatization));
2916 acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
2917 if (!can_tile_for_shared_memory(gen, group->array, acc,
2918 group->private_bound)) {
2919 free_bound_list(group->private_bound, n_index);
2920 group->private_bound = NULL;
2923 isl_map_free(acc);
2926 /* Look for the last shared tile loop that affects the offset of the
2927 * shared or private tile and store the result in array->last_shared.
2929 static void set_last_shared(struct cuda_gen *gen,
2930 struct cuda_array_ref_group *group)
2932 int i, j;
2933 struct cuda_array_bound *bounds;
2934 unsigned first_shared = gen->first_shared;
2935 int n_index = group->array->n_index;
2937 bounds = group->private_bound;
2938 if (!bounds)
2939 bounds = group->shared_bound;
2940 if (!bounds)
2941 return;
2943 for (j = gen->shared_len - 1; j >= 0; --j) {
2944 for (i = 0; i < n_index; ++i) {
2945 isl_aff *lb;
2946 isl_qpolynomial *shift;
2948 lb = bounds[i].lb;
2949 if (isl_aff_involves_dims(lb, isl_dim_param,
2950 first_shared + j, 1))
2951 break;
2953 shift = bounds[i].shift;
2954 if (!shift)
2955 continue;
2956 if (isl_qpolynomial_involves_dims(shift, isl_dim_param,
2957 first_shared + j, 1))
2958 break;
2960 if (i < n_index)
2961 break;
2963 group->array->last_shared = j;
2966 /* Compute the sizes of all private arrays for the current kernel,
2967 * as well as the offsets of the private pieces in the original arrays.
2968 * If we cannot or don't want to privatize a given array group,
2969 * we use the shared memory tile sizes computed in
2970 * compute_group_shared_bound instead.
2972 * If a given array only has a single reference group and if we have
2973 * been able to find a private or shared tile,
2974 * we also look for the last shared tile loop that affects the offset
2975 * (and therefore the array tile) and store the result in array->last_shared.
2977 * A privatized copy of all access relations from reference groups that
2978 * are mapped to private memory is stored in gen->private_access.
2980 static void compute_private_size(struct cuda_gen *gen)
2982 int i, j;
2983 isl_union_map *private;
2985 private = isl_union_map_empty(isl_union_map_get_dim(gen->shared_sched));
2987 for (i = 0; i < gen->n_array; ++i) {
2988 struct cuda_array_info *array = &gen->array[i];
2990 for (j = 0; j < array->n_group; ++j) {
2991 check_private_group_access(gen, array->groups[j]);
2993 if (!array->groups[j]->private_bound)
2994 continue;
2996 private = isl_union_map_union(private,
2997 group_access_relation(array->groups[j], 1, 1));
3000 array->last_shared = gen->shared_len - 1;
3001 array->print_shared_level = -1;
3003 if (array->n_group != 1)
3004 continue;
3005 set_last_shared(gen, array->groups[0]);
3008 if (isl_union_map_is_empty(private))
3009 isl_union_map_free(private);
3010 else {
3011 isl_union_map *priv;
3013 private = isl_union_map_apply_domain(private,
3014 isl_union_map_copy(gen->shared_sched));
3015 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
3016 private = isl_union_map_apply_domain(private, priv);
3017 gen->private_access = private;
3021 /* Fill up the groups array with singleton groups, i.e., one group
3022 * per reference, initializing the array, access, write and refs fields.
3023 * In particular the access field is initialized to the scheduled
3024 * access relation of the array reference.
3026 * Return the number of elements initialized, i.e., the number of
3027 * active references in the current kernel.
3029 static int populate_array_references(struct cuda_gen *gen,
3030 struct cuda_array_info *array, __isl_keep isl_union_map *sched,
3031 struct cuda_array_ref_group **groups)
3033 int i;
3034 int n;
3035 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3037 n = 0;
3038 for (i = 0; i < array->n_ref; ++i) {
3039 isl_union_map *umap;
3040 isl_map *map;
3041 struct cuda_array_ref_group *group;
3042 struct cuda_stmt_access *access = array->refs[i];
3044 map = isl_map_copy(access->access);
3045 umap = isl_union_map_from_map(map);
3046 umap = isl_union_map_apply_domain(umap,
3047 isl_union_map_copy(sched));
3049 if (isl_union_map_is_empty(umap)) {
3050 isl_union_map_free(umap);
3051 continue;
3054 map = isl_map_from_union_map(umap);
3056 group = isl_calloc_type(ctx, struct cuda_array_ref_group);
3057 assert(group);
3058 group->array = array;
3059 group->access = map;
3060 group->write = access->write;
3061 group->refs = &array->refs[i];
3063 groups[n++] = group;
3066 return n;
3069 static void free_array_ref_group(struct cuda_array_ref_group *group,
3070 int n_index)
3072 if (!group)
3073 return;
3074 free_bound_list(group->shared_bound, n_index);
3075 free_bound_list(group->private_bound, n_index);
3076 isl_map_free(group->access);
3077 free(group->refs);
3078 free(group);
3081 /* If two groups have overlapping access relations and if one of them
3082 * involves a write, then merge the two groups into one.
3084 * We keep track of the grouping in "leader". leader[j] points to
3085 * an earlier group array element that belongs to the same group,
3086 * or the array element j itself if this element is the first in the group.
3088 * Return the number of group leaders.
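 *
 * For example (illustrative), for three initial groups where group 1 writes
 * to elements that are also accessed by group 0, the result is
 * leader = { 0, 0, 2 } and the function returns 2.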
3090 static int group_overlapping_writes(int n,
3091 struct cuda_array_ref_group **groups, int *leader)
3093 int i, j;
3094 int n_group = n;
3096 for (i = 0; i < n; ++i) {
3097 int l = i;
3098 groups[l]->n_ref = 1;
3099 for (j = i - 1; j >= 0; --j) {
3100 isl_map *map;
3101 int empty;
3103 if (leader[j] != j)
3104 continue;
3105 if (!groups[l]->write && !groups[j]->write)
3106 continue;
3108 map = isl_map_intersect(isl_map_copy(groups[l]->access),
3109 isl_map_copy(groups[j]->access));
3110 empty = isl_map_is_empty(map);
3111 isl_map_free(map);
3113 if (empty)
3114 continue;
3116 groups[j]->access = isl_map_union(groups[j]->access,
3117 groups[l]->access);
3118 groups[j]->write = 1;
3119 groups[l]->access = NULL;
3120 groups[j]->n_ref += groups[l]->n_ref;
3121 l = leader[l] = j;
3122 n_group--;
3124 leader[i] = l;
3127 return n_group;
3130 /* Compute the size of the shared array corresponding to the given
3131 * array reference group, based on the accesses from the current kernel,
3132 * as well as the offset of the shared piece in the original array.
3134 static void compute_group_shared_bound(struct cuda_gen *gen,
3135 struct cuda_array_info *array, struct cuda_array_ref_group *group)
3137 isl_ctx *ctx = isl_dim_get_ctx(array->dim);
3139 group->shared_bound = create_bound_list(ctx, array->n_index);
3140 if (!can_tile_for_shared_memory(gen, array, group->access,
3141 group->shared_bound)) {
3142 free_bound_list(group->shared_bound, array->n_index);
3143 group->shared_bound = NULL;
3147 /* Given an initial grouping of array references and shared memory tiles
3148 * for each group that allows for a shared memory tile, merge two groups
3149 * if both have a shared memory tile and if the merged group also has
3150 * a shared memory tile.
3152 * Return the number of group leaders after merging.
3154 static int group_common_shared_memory_tile(struct cuda_gen *gen,
3155 struct cuda_array_info *array, int n,
3156 struct cuda_array_ref_group **groups, int *leader, int n_group)
3158 int i, j;
3159 isl_ctx *ctx = isl_dim_get_ctx(array->dim);
3161 for (i = 0; n_group > 1 && i < n; ++i) {
3162 int l = i;
3163 if (leader[i] != i)
3164 continue;
3165 if (!groups[i]->shared_bound)
3166 continue;
3167 for (j = i - 1; j >= 0; --j) {
3168 isl_map *map;
3169 int empty;
3170 struct cuda_array_bound *shared_bound;
3172 if (leader[j] != j)
3173 continue;
3174 if (!groups[j]->shared_bound)
3175 continue;
3177 map = isl_map_intersect(isl_map_copy(groups[l]->access),
3178 isl_map_copy(groups[j]->access));
3179 empty = isl_map_is_empty(map);
3180 isl_map_free(map);
3182 if (empty)
3183 continue;
3185 map = isl_map_union(isl_map_copy(groups[l]->access),
3186 isl_map_copy(groups[j]->access));
3187 shared_bound = create_bound_list(ctx, array->n_index);
3188 if (!can_tile_for_shared_memory(gen, array, map,
3189 shared_bound)) {
3190 isl_map_free(map);
3191 free_bound_list(shared_bound, array->n_index);
3192 continue;
3195 free_bound_list(groups[j]->shared_bound,
3196 array->n_index);
3197 groups[j]->shared_bound = shared_bound;
3198 isl_map_free(groups[j]->access);
3199 groups[j]->access = map;
3200 groups[j]->n_ref += groups[l]->n_ref;
3201 l = leader[l] = j;
3202 n_group--;
3206 return n_group;
3209 /* Extract an array of array reference groups from the array of references
3210 * and the grouping information in "leader".
3212 * Store the results in array->n_group and array->groups.
3214 static void extract_array_groups(isl_ctx *ctx, struct cuda_array_info *array,
3215 int n, struct cuda_array_ref_group **groups, int *leader, int n_group)
3217 int i, j;
3219 for (i = 2; i < n; ++i)
3220 leader[i] = leader[leader[i]];
3222 array->n_group = n_group;
3223 array->groups = isl_alloc_array(ctx, struct cuda_array_ref_group *,
3224 n_group);
3225 assert(array->groups);
3227 j = 0;
3228 for (i = 0; i < n; ++i) {
3229 int k, l;
3230 struct cuda_stmt_access **refs;
3232 if (leader[i] != i) {
3233 groups[i]->refs = NULL;
3234 free_array_ref_group(groups[i], array->n_index);
3235 continue;
3238 refs = isl_alloc_array(ctx, struct cuda_stmt_access *,
3239 groups[i]->n_ref);
3240 assert(refs);
3241 l = 0;
3242 for (k = i; k < n; ++k)
3243 if (leader[k] == i) {
3244 refs[l++] = *groups[k]->refs;
3245 (*groups[k]->refs)->group = j;
3248 groups[i]->refs = refs;
3249 groups[i]->nr = j;
3250 array->groups[j++] = groups[i];
3254 /* Group array references that should be considered together when
3255 * deciding whether to access them from private, shared or global memory.
3257 * In particular, if two array references overlap and if one of them
3258 * is a write, then the two references are grouped together.
3259 * Furthermore, if two groups admit a shared memory tile and if the
3260 * combination of the two also admits a shared memory tile, we merge
3261 * the two groups.
3263 * During the construction the group->refs field points to a single
3264 * array reference inside the array of array references, while
3265 * group->n_ref contains the number of elements in leader that
3266 * (directly or indirectly) point to this group, provided the group
3267 * is a leader.
3269 static void group_array_references(struct cuda_gen *gen,
3270 struct cuda_array_info *array, __isl_keep isl_union_map *sched)
3272 int i;
3273 int n, n_group;
3274 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3275 struct cuda_array_ref_group **groups;
3276 int *leader;
3278 groups = isl_calloc_array(ctx, struct cuda_array_ref_group *,
3279 array->n_ref);
3280 assert(groups);
3282 n = populate_array_references(gen, array, sched, groups);
3284 leader = isl_alloc_array(ctx, int, n);
3285 assert(leader);
3287 n_group = group_overlapping_writes(n, groups, leader);
3289 for (i = 0; i < n; ++i)
3290 if (leader[i] == i)
3291 compute_group_shared_bound(gen, array, groups[i]);
3293 n_group = group_common_shared_memory_tile(gen, array, n, groups,
3294 leader, n_group);
3296 extract_array_groups(ctx, array, n, groups, leader, n_group);
3298 free(leader);
3299 free(groups);
3302 /* Take tiled_sched, project it onto the shared tile loops and
3303 * the loops that will be wrapped over the threads,
3304 * parametrize the shared tile loops and store the result in gen->shared_sched.
3305 * The position of the first of these parameters is stored in gen->first_shared.
3306 * Also compute a projection that projects out the loops that will be
3307 * wrapped over the threads and store this projection in gen->shared_proj.
3309 static void compute_shared_sched(struct cuda_gen *gen)
3311 isl_dim *dim;
3312 isl_map *proj;
3313 isl_set *par;
3314 isl_union_map *sched;
3316 sched = isl_union_map_copy(gen->tiled_sched);
3318 dim = isl_union_map_get_dim(sched);
3319 gen->first_shared = isl_dim_size(dim, isl_dim_param);
3320 proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
3321 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3323 dim = isl_union_map_get_dim(sched);
3324 par = parametrization(dim, gen->shared_len + gen->n_block,
3325 0, gen->shared_len, "g");
3326 sched = isl_union_map_intersect_range(sched,
3327 isl_union_set_from_set(par));
3329 dim = isl_union_map_get_dim(sched);
3330 proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
3332 gen->shared_sched = sched;
3333 gen->shared_proj = isl_union_map_from_map(proj);
3336 /* Group references of all arrays in the program.
3338 static void group_references(struct cuda_gen *gen)
3340 int i;
3341 isl_union_map *sched;
3343 sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3344 isl_union_map_copy(gen->shared_proj));
3346 for (i = 0; i < gen->n_array; ++i)
3347 group_array_references(gen, &gen->array[i], sched);
3349 isl_union_map_free(sched);
3352 /* Free all array information that is local to the current kernel.
3354 static void free_local_array_info(struct cuda_gen *gen)
3356 int i, j;
3358 for (i = 0; i < gen->n_array; ++i) {
3359 struct cuda_array_info *array = &gen->array[i];
3361 for (j = 0; j < array->n_group; ++j)
3362 free_array_ref_group(array->groups[j], array->n_index);
3363 free(array->groups);
3365 if (array->n_group == 0)
3366 continue;
3367 for (j = 0; j < gen->array[i].n_index; ++j) {
3368 isl_pw_aff_free(gen->array[i].local_bound[j]);
3369 gen->array[i].local_bound[j] = NULL;
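/* Print a parenthesized, comma-separated list of "len" iterators with the
 * given prefix, optionally wrapping each iterator in its own parentheses.
 * For example, for len = 2 and prefix "a", this prints either "(a0, a1)"
 * or "((a0), (a1))", depending on "parens".
 */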
3374 static void print_iterator_list(FILE *out, int len, const char *prefix,
3375 int parens)
3377 int i;
3379 fprintf(out, "(");
3380 for (i = 0; i < len; ++i) {
3381 if (i)
3382 fprintf(out, ", ");
3383 if (parens)
3384 fprintf(out, "(%s%d)", prefix, i);
3385 else
3386 fprintf(out, "%s%d", prefix, i);
3388 fprintf(out, ")");
3391 /* Print an access to the element in the global memory copy of the
3392 * given array that corresponds to element [a0][a1]... of the original array.
3393 * The copy in global memory has been linearized, so we need to take
3394 * the array size into account.
3396 static void print_global_index(isl_ctx *ctx, FILE *out,
3397 struct cuda_array_info *array)
3399 int i;
3400 isl_printer *prn;
3402 fprintf(out, "%s[", array->name);
3403 for (i = 0; i + 1 < array->n_index; ++i)
3404 fprintf(out, "(");
3405 for (i = 0; i < array->n_index; ++i) {
3406 if (i) {
3407 prn = isl_printer_to_file(ctx, out);
3408 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
3409 prn = isl_printer_print_str(prn, ") * (");
3410 prn = isl_printer_print_pw_aff(prn,
3411 array->local_bound[i]);
3412 prn = isl_printer_print_str(prn, ") + ");
3413 isl_printer_free(prn);
3415 fprintf(out, "a%d", i);
3417 fprintf(out, "]");
3420 /* Print an access to the element in the shared memory copy of the
3421 * given array that corresponds to element [a0][a1]... of the original array.
3422 * Since the array in shared memory is just a shifted copy of part
3423 * of the original array, we simply need to subtract the lower bound,
3424 * which was computed in can_tile_for_shared_memory.
3425 * If any of the indices is strided, then we first add
3426 * shared_bound[i].shift and divide by shared_bound[i].stride.
3428 static void print_local_index(FILE *out, struct cuda_array_ref_group *group)
3430 int i;
3431 isl_ctx *ctx;
3432 isl_printer *prn;
3433 struct cuda_array_bound *bounds = group->shared_bound;
3435 ctx = isl_dim_get_ctx(group->array->dim);
3436 print_array_name(out, group);
3437 for (i = 0; i < group->array->n_index; ++i) {
3438 fprintf(out, "[(a%d", i);
3439 if (bounds[i].shift) {
3440 fprintf(out, " + (");
3441 prn = isl_printer_to_file(ctx, out);
3442 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
3443 prn = isl_printer_print_qpolynomial(prn,
3444 bounds[i].shift);
3445 prn = isl_printer_print_str(prn, "))/");
3446 prn = isl_printer_print_isl_int(prn,
3447 bounds[i].stride);
3448 isl_printer_free(prn);
3449 } else
3450 fprintf(out, ")");
3451 fprintf(out, " - (");
3452 prn = isl_printer_to_file(ctx, out);
3453 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
3454 prn = isl_printer_print_aff(prn, bounds[i].lb);
3455 isl_printer_free(prn);
3456 fprintf(out, ")]");
3460 /* Print '#define's for copying data from global memory to shared
3461 * memory and back for the given array.
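 *
 * For example (for a hypothetical two-index array A mapped to shared
 * memory), the printed macros look roughly as follows
 *
 *	#define read_shared_A(a0, a1) read_shared_A_((a0), (a1))
 *	#define read_shared_A_(a0, a1) shared_A[...] = A[...]
 *
 * together with the corresponding write_shared_A macros with the
 * assignment reversed; the elided index expressions are produced by
 * print_local_index and print_global_index above.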
3463 static void print_array_copy_defines(struct cuda_gen *gen,
3464 struct cuda_array_ref_group *group)
3466 int i;
3467 const char *type[] = { "read", "write" };
3468 struct cuda_array_info *array = group->array;
3469 int n_index = array->n_index;
3471 for (i = 0; i < 2; ++i) {
3472 fprintf(gen->cuda.kernel_c, "#define %s_", type[i]);
3473 print_array_name(gen->cuda.kernel_c, group);
3474 print_iterator_list(gen->cuda.kernel_c, n_index, "a", 0);
3475 fprintf(gen->cuda.kernel_c, " %s_", type[i]);
3476 print_array_name(gen->cuda.kernel_c, group);
3477 fprintf(gen->cuda.kernel_c, "_");
3478 print_iterator_list(gen->cuda.kernel_c, n_index, "a", 1);
3479 fprintf(gen->cuda.kernel_c, "\n");
3481 fprintf(gen->cuda.kernel_c, "#define %s_", type[i]);
3482 print_array_name(gen->cuda.kernel_c, group);
3483 fprintf(gen->cuda.kernel_c, "_");
3484 print_iterator_list(gen->cuda.kernel_c, n_index, "a", 0);
3485 if (i) {
3486 fprintf(gen->cuda.kernel_c, " ");
3487 print_global_index(gen->ctx, gen->cuda.kernel_c, array);
3488 fprintf(gen->cuda.kernel_c, " = ");
3489 print_local_index(gen->cuda.kernel_c, group);
3490 } else {
3491 fprintf(gen->cuda.kernel_c, " ");
3492 print_local_index(gen->cuda.kernel_c, group);
3493 fprintf(gen->cuda.kernel_c, " = ");
3494 print_global_index(gen->ctx, gen->cuda.kernel_c, array);
3496 fprintf(gen->cuda.kernel_c, "\n");
3500 static void print_copy_defines(struct cuda_gen *gen)
3502 int i, j;
3504 for (i = 0; i < gen->n_array; ++i) {
3505 struct cuda_array_info *array = &gen->array[i];
3507 for (j = 0; j < array->n_group; ++j) {
3508 if (array->groups[j]->private_bound)
3509 continue;
3510 if (!array->groups[j]->shared_bound)
3511 continue;
3512 print_array_copy_defines(gen, array->groups[j]);
3517 /* The sizes of the arrays on the host that have been computed by
3518 * extract_array_info may depend on the parameters. Use the extra
3519 * constraints on the parameters that are valid at "host_domain"
3520 * to simplify these expressions.
3522 static void localize_bounds(struct cuda_gen *gen,
3523 __isl_keep isl_set *host_domain)
3525 int i, j;
3526 isl_set *context;
3527 unsigned nvar;
3529 context = isl_set_copy(host_domain);
3530 nvar = isl_set_dim(host_domain, isl_dim_set);
3531 context = isl_set_project_out(context, isl_dim_set, 0, nvar);
3533 for (i = 0; i < gen->n_array; ++i) {
3534 struct cuda_array_info *array = &gen->array[i];
3536 if (array->n_group == 0)
3537 continue;
3539 for (j = 0; j < array->n_index; ++j) {
3540 isl_pw_aff *pwaff;
3542 pwaff = isl_pw_aff_copy(array->bound[j]);
3543 pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3544 array->local_bound[j] = pwaff;
3547 isl_set_free(context);
3550 /* Set gen->tile_len and gen->n_parallel to those of the first statement
3551 * in the statement list u.
3552 * Because of the way the schedule is constructed, the other statements
3553 * in the list, if any, should have the same values for these properties.
3555 static void set_tile_len(struct cuda_gen *gen, struct clast_user_stmt *u)
3557 int nr;
3558 struct cuda_stmt *stmt;
3560 nr = atoi(u->statement->name + 2);
3561 stmt = &gen->stmts[nr];
3563 gen->tile_len = stmt->tile_len;
3564 gen->n_parallel = stmt->n_parallel;
3567 /* This function is called for each leaf in the clast of the host code.
3568 * We first specialize the schedule to the site of the leaf, compute
3569 * the size of shared memory and then print the body of host code
3570 * and the associated kernel (through a call to print_kernel_body).
3572 static void print_host_user(struct gpucode_info *code,
3573 struct clast_user_stmt *u)
3575 struct cuda_gen *gen = code->user;
3576 isl_dim *dim;
3577 isl_set *par;
3578 isl_set *host_domain;
3579 isl_union_map *access;
3580 isl_union_map *local_sched;
3581 isl_union_set *arrays;
3583 set_tile_len(gen, u);
3584 read_sizes(gen);
3586 host_domain = extract_entire_host_domain(u);
3588 local_sched = isl_union_map_intersect_range(
3589 isl_union_map_copy(gen->sched),
3590 isl_union_set_from_set(extend(isl_set_copy(host_domain),
3591 gen->untiled_len)));
3592 access = isl_union_map_union(isl_union_map_copy(gen->read),
3593 isl_union_map_copy(gen->write));
3594 access = isl_union_map_apply_domain(access,
3595 isl_union_map_copy(local_sched));
3596 arrays = isl_union_map_range(access);
3598 print_indent(code->dst, code->indent);
3599 fprintf(code->dst, "dim3 k%d_dimBlock(", gen->kernel_id);
3600 print_reverse_list(code->dst, gen->n_block, gen->block_dim);
3601 fprintf(code->dst, ");\n");
3603 print_indent(code->dst, code->indent);
3604 fprintf(code->dst, "dim3 k%d_dimGrid(", gen->kernel_id);
3605 print_reverse_list(code->dst, gen->n_grid, gen->grid_dim);
3606 fprintf(code->dst, ");\n");
3608 gen->tiled_sched = tile_schedule(gen, local_sched);
3609 gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
3610 gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
3612 gen->local_sched = isl_union_map_copy(gen->tiled_sched);
3614 dim = isl_union_map_get_dim(gen->local_sched);
3615 par = parametrization(dim, gen->tiled_len, 0, gen->shared_len, "g");
3616 gen->local_sched = isl_union_map_intersect_range(gen->local_sched,
3617 isl_union_set_from_set(par));
3619 gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
3620 gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
3622 gen->private_access = NULL;
3623 compute_shared_sched(gen);
3624 gen->privatization = compute_privatization(gen);
3625 group_references(gen);
3626 compute_private_size(gen);
3627 localize_bounds(gen, host_domain);
3629 gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
3631 print_copy_defines(gen);
3632 print_kernel_launch(gen, arrays);
3634 fprintf(gen->cuda.kernel_c, "{\n");
3636 print_kernel_body(gen, host_domain, gen->tiled_sched);
3638 fprintf(gen->cuda.kernel_c, "}\n");
3640 free_local_array_info(gen);
3641 isl_map_free(gen->privatization);
3642 isl_union_map_free(gen->private_access);
3643 isl_union_map_free(gen->local_sched);
3644 isl_union_map_free(gen->tiled_sched);
3645 isl_union_map_free(gen->shared_sched);
3646 isl_union_map_free(gen->shared_proj);
3647 isl_union_set_free(arrays);
3648 isl_set_free(host_domain);
3650 free(gen->tile_size);
3651 gen->kernel_id++;
3654 /* Use CLooG to generate code for the outer gen->tile_first loops
3655 * of the global schedule in gen->sched.
3656 * The pretty printing of this code is handled by gpu_print_host_stmt,
3657 * which calls print_host_user for each kernel invocation location.
3659 static void print_cloog_host_code(struct cuda_gen *gen)
3661 int i;
3662 isl_set *context;
3663 isl_union_map *sched;
3664 CloogOptions *options;
3665 CloogDomain *cloog_context;
3666 CloogUnionDomain *ud;
3667 CloogInput *input;
3668 struct clast_stmt *stmt;
3669 char name[20];
3671 options = cloog_options_malloc(gen->state);
3672 options->language = LANGUAGE_C;
3673 options->otl = 0;
3674 options->strides = 1;
3675 options->stop = gen->tile_first;
3676 options->f = gen->untiled_len;
3677 options->l = gen->untiled_len;
3678 options->save_domains = 1;
3679 options->noscalars = 1;
3681 sched = isl_union_map_copy(gen->sched);
3682 ud = cloog_union_domain_from_isl_union_map(sched);
3683 for (i = 0; i < options->stop; ++i) {
3684 snprintf(name, sizeof(name), "h%d", i);
3685 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
3687 context = isl_set_copy(gen->context);
3688 cloog_context = cloog_domain_from_isl_set(context);
3689 input = cloog_input_alloc(cloog_context, ud);
3691 stmt = cloog_clast_create_from_input(input, options);
3693 gen->code.indent = 0;
3694 gen->code.dst = gen->cuda.host_c;
3695 gen->code.print_user_stmt = NULL;
3696 gen->code.print_user_stmt_list = &print_host_user;
3697 gen->code.print_for_head = NULL;
3698 gen->code.print_for_foot = NULL;
3699 gen->code.user = gen;
3700 gpu_print_host_stmt(&gen->code, stmt);
3702 cloog_clast_free(stmt);
3703 cloog_options_free(options);
3706 void print_host_code(struct cuda_gen *gen)
3708 fprintf(gen->cuda.host_c, "{\n");
3709 print_cloog_macros(gen->cuda.host_c);
3710 print_cloog_macros(gen->cuda.kernel_c);
3712 declare_device_arrays(gen);
3714 allocate_device_arrays(gen);
3715 copy_arrays_to_device(gen);
3717 gen->kernel_id = 0;
3718 print_cloog_host_code(gen);
3720 copy_arrays_from_device(gen);
3721 free_device_arrays(gen);
3723 fprintf(gen->cuda.host_c, "}\n");
3726 __isl_give isl_set *add_context_from_str(__isl_take isl_set *set,
3727 const char *str)
3729 isl_ctx *ctx;
3730 isl_set *context;
3732 if (!str)
3733 return set;
3735 ctx = isl_set_get_ctx(set);
3736 context = isl_set_read_from_str(ctx, str, -1);
3737 context = isl_set_align_params(context, isl_set_get_dim(set));
3738 set = isl_set_intersect(set, context);
3740 return set;
3743 /* Return the union of all iteration domains of the gen->stmts[i].
3745 static __isl_give isl_union_set *extract_domain(struct cuda_gen *gen)
3747 int i;
3748 isl_union_set *domain;
3750 domain = isl_union_set_empty(isl_set_get_dim(gen->context));
3751 for (i = 0; i < gen->n_stmts; ++i) {
3752 isl_set *domain_i;
3754 domain_i = isl_set_copy(gen->stmts[i].domain);
3755 domain = isl_union_set_union(domain,
3756 isl_union_set_from_set(domain_i));
3759 return domain;
3762 /* Information about the outermost tilable bands in the forest of bands.
3764 * tile_len and n_parallel are only set on band_info structures
3765 * that correspond to outermost bands. For other bands (in particular,
3766 * ancestors of the outermost bands), n_parallel is set to 0.
3768 * prefix is the (padded) schedule leading up to the outermost tilable bands.
3770 * tile_first is the number of schedule dimensions in prefix.
3772 * suffix is the schedule of the outermost tilable bands and their descendants.
3774 struct band_info {
3775 struct cuda_gen *gen;
3776 int tile_first;
3777 int tile_len;
3778 int n_parallel;
3779 isl_union_map *prefix;
3780 isl_union_map *suffix;
3781 };
3783 /* Set tile_len and n_parallel of each statement to those of
3784 * its outermost band, as recorded in the band_info.
3785 */
3786 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
3787 {
3788 struct band_info *info = user;
3789 int nr;
3790 struct cuda_stmt *stmt;
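/* The tuple names of the prefix schedule are assumed to be of the form
 * "S_<nr>"; skip the two-character prefix to recover the statement number.
 */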
3792 nr = atoi(isl_map_get_tuple_name(map, isl_dim_in) + 2);
3793 stmt = &info->gen->stmts[nr];
3795 stmt->tile_len = info->tile_len;
3796 stmt->n_parallel = info->n_parallel;
3798 isl_map_free(map);
3800 return 0;
3801 }
3803 static void list_select_outer_band(struct cuda_gen *gen,
3804 __isl_take isl_band_list *list, int pos, struct band_info *list_info);
3806 /* Check if this band has any parallel loops. If so, take it as
3807 * the outermost tilable band. If not, continue looking for the
3808 * outermost tilable band in the children of the current band.
3809 */
3810 static void band_select_outer_band(struct cuda_gen *gen,
3811 __isl_take isl_band *band, int pos, struct band_info *info)
3812 {
3813 int n = isl_band_n_member(band);
3814 int n_parallel;
3816 for (n_parallel = 0; n_parallel < n; ++n_parallel)
3817 if (!isl_band_member_is_zero_distance(band, n_parallel))
3818 break;
3820 info->n_parallel = n_parallel;
3821 if (n_parallel) {
3822 info->gen = gen;
3823 info->tile_first = pos;
3824 info->tile_len = n;
3825 info->prefix = isl_band_get_prefix_schedule(band);
3826 info->suffix = isl_union_map_flat_range_product(
3827 isl_band_get_partial_schedule(band),
3828 isl_band_get_suffix_schedule(band));
3829 isl_union_map_foreach_map(info->prefix,
3830 &set_stmt_tile_len, info);
3831 } else {
3832 isl_band_list *children;
3833 if (!isl_band_has_children(band))
3834 isl_die(isl_band_get_ctx(band), isl_error_unknown,
3835 "unable to detect any parallelism", abort());
3836 children = isl_band_get_children(band);
3837 list_select_outer_band(gen, children, pos + n, info);
3838 }
3840 isl_band_free(band);
3841 }
3843 /* Comparison function that returns a non-zero value for band_infos
3844 * with different tile_len fields or different n_parallel fields.
3845 */
3846 static int cmp_band(const void *p1, const void *p2)
3847 {
3848 const struct band_info *info1 = p1;
3849 const struct band_info *info2 = p2;
3851 if (info1->tile_len != info2->tile_len)
3852 return info1->tile_len - info2->tile_len;
3854 return info1->n_parallel - info2->n_parallel;
3855 }
3857 /* Extend "umap" with coordinates with fixed value "val"
3858 * to a total length of "dst_len", assuming the original dimension is "src_len".
3859 */
3860 static __isl_give isl_union_map *extend_range(__isl_take isl_union_map *umap,
3861 int src_len, int dst_len, int val)
3862 {
3863 isl_dim *dim;
3864 isl_map *map;
3865 int i;
3867 dim = isl_union_map_get_dim(umap);
3868 map = isl_map_reverse(projection(dim, dst_len, src_len));
3869 for (i = src_len; i < dst_len; ++i)
3870 map = isl_map_fix_si(map, isl_dim_out, i, val);
3872 umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
3874 return umap;
3875 }
3877 /* Group bands with the same values for tile_len and n_parallel.
3878 * The prefix schedule is then extended with a fixed coordinate that
3879 * is different for each such group.
3880 * Note that the actual values for this coordinate are not important.
3881 * The bands have already been effectively separated at a higher level
3882 * or they are independent and may be executed in parallel.
3883 * The list of band_info has been sorted before this function is called.
3884 */
3885 static void separate_bands(struct band_info *info, int n)
3886 {
3887 int i;
3888 int j = 0;
3890 for (i = 0; i < n; ++i) {
3891 int l = info[i].tile_first;
3893 if (i &&
3894 (info[i].tile_len != info[i - 1].tile_len ||
3895 info[i].n_parallel != info[i - 1].n_parallel))
3896 j++;
3898 info[i].prefix = extend_range(info[i].prefix,
3899 l, l + 1, j);
3900 info[i].tile_first = l + 1;
3901 }
3902 }
3904 /* Select the outermost bands in the elements of the list, align
3905 * their prefix schedules, separate bands with different values
3906 * for tile_len and/or n_parallel and then combine the resulting
3907 * prefix and suffix schedules into a single pair of prefix and
3908 * suffix schedules for the entire list.
3909 */
3910 static void list_select_outer_band(struct cuda_gen *gen,
3911 __isl_take isl_band_list *list, int pos, struct band_info *list_info)
3912 {
3913 isl_band *band;
3914 int i;
3915 int n = isl_band_list_n_band(list);
3916 isl_ctx *ctx = isl_band_list_get_ctx(list);
3917 struct band_info *info;
3918 int max_tile_first;
3919 isl_union_map *prefix;
3920 isl_union_map *suffix;
3922 assert(n >= 1);
3923 info = isl_calloc_array(ctx, struct band_info, n);
3924 assert(info);
3926 max_tile_first = 0;
3927 for (i = 0; i < n; ++i) {
3928 band = isl_band_list_get_band(list, i);
3929 band_select_outer_band(gen, band, pos, &info[i]);
3930 if (info[i].tile_first > max_tile_first)
3931 max_tile_first = info[i].tile_first;
3932 }
3934 for (i = 0; i < n; ++i) {
3935 if (info[i].tile_first == max_tile_first)
3936 continue;
3937 info[i].prefix = extend_range(info[i].prefix,
3938 info[i].tile_first, max_tile_first, 0);
3939 }
3941 qsort(info, n, sizeof(struct band_info), &cmp_band);
3943 for (i = 0; i < n - 1; ++i)
3944 if (info[i].tile_len != info[i + 1].tile_len ||
3945 info[i].n_parallel != info[i + 1].n_parallel)
3946 break;
3948 if (i < n - 1)
3949 separate_bands(info, n);
3951 prefix = info[0].prefix;
3952 suffix = info[0].suffix;
3954 for (i = 1; i < n; ++i) {
3955 prefix = isl_union_map_union(prefix, info[i].prefix);
3956 suffix = isl_union_map_union(suffix, info[i].suffix);
3957 }
3959 list_info->tile_first = info[0].tile_first;
3960 list_info->tile_len = -1;
3961 list_info->prefix = prefix;
3962 list_info->suffix = suffix;
3964 isl_band_list_free(list);
3965 free(info);
3966 }
3968 /* Set max_out to the maximal number of output dimensions over
3969 * all maps.
3970 */
3971 static int update_max_out(__isl_take isl_map *map, void *user)
3972 {
3973 int *max_out = user;
3974 int n_out = isl_map_dim(map, isl_dim_out);
3976 if (n_out > *max_out)
3977 *max_out = n_out;
3979 isl_map_free(map);
3980 return 0;
3981 }
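/* Data used while aligning the range dimensions of a union map:
 * "max_out" is the maximal number of output dimensions over all maps and
 * "res" collects the maps after their ranges have been extended to that size.
 */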
3983 struct align_range_data {
3984 int max_out;
3985 isl_union_map *res;
3986 };
3988 /* Extend the dimension of the range of the given map to data->max_out and
3989 * then add the result to data->res.
3990 */
3991 static int map_align_range(__isl_take isl_map *map, void *user)
3992 {
3993 struct align_range_data *data = user;
3994 int i;
3995 isl_dim *dim;
3996 isl_map *proj;
3997 int n_out = isl_map_dim(map, isl_dim_out);
3999 dim = isl_union_map_get_dim(data->res);
4000 proj = isl_map_reverse(projection(dim, data->max_out, n_out));
4001 for (i = n_out; i < data->max_out; ++i)
4002 proj = isl_map_fix_si(proj, isl_dim_out, i, 0);
4004 map = isl_map_apply_range(map, proj);
4006 data->res = isl_union_map_add_map(data->res, map);
4008 return 0;
4009 }
4011 /* Extend the ranges of the maps in the union map such that they all have
4012 * the same dimension.
4013 */
4014 static __isl_give isl_union_map *align_range(__isl_take isl_union_map *umap)
4015 {
4016 struct align_range_data data;
4018 data.max_out = 0;
4019 isl_union_map_foreach_map(umap, &update_max_out, &data.max_out);
4021 data.res = isl_union_map_empty(isl_union_map_get_dim(umap));
4022 isl_union_map_foreach_map(umap, &map_align_range, &data);
4024 isl_union_map_free(umap);
4025 return data.res;
4026 }
4028 /* Select the outermost tilable band that (by construction)
4029 * has at least one parallel loop.
4030 * The starting position of the aligned band is stored in
4031 * gen->tile_first.
4032 * The sizes and number of parallel loops may be different in different
4033 * parts of the band forest and are therefore stored in the cuda_stmts.
4035 * Return the complete schedule, with the tilable bands aligned
4036 * at gen->tile_first and padded with zero, if needed.
4037 */
4038 static __isl_give isl_union_map *select_outer_tilable_band(struct cuda_gen *gen,
4039 __isl_keep isl_schedule *schedule)
4040 {
4041 isl_band_list *list;
4042 struct band_info info;
4044 gen->n_parallel = 0;
4045 gen->tile_len = -1;
4047 list = isl_schedule_get_band_forest(schedule);
4049 list_select_outer_band(gen, list, 0, &info);
4051 gen->tile_first = info.tile_first;
4052 info.suffix = align_range(info.suffix);
4054 return isl_union_map_flat_range_product(info.prefix, info.suffix);
4055 }
4057 /* Set gen->untiled_len to the number of scheduling dimensions
4058 * for the schedule of the first domain.
4059 * We assume here that this number is the same for all domains.
4060 */
4061 static int set_untiled_len(__isl_take isl_map *map, void *user)
4062 {
4063 unsigned *untiled_len = user;
4065 *untiled_len = isl_map_dim(map, isl_dim_out);
4067 isl_map_free(map);
4068 return -1;
4069 }
4071 /* Compute an appropriate schedule based on the accesses in
4072 * gen->read and gen->write.
4074 * We first compute dependences and then use those to compute
4075 * a schedule that has a parallel loop in each tilable band.
4076 * Finally, we select the outermost tilable band.
4077 */
4078 static void compute_schedule(struct cuda_gen *gen,
4079 __isl_take isl_union_map *sched)
4080 {
4081 isl_ctx *ctx = isl_union_map_get_ctx(sched);
4082 isl_union_set *domain;
4083 isl_union_map *empty;
4084 isl_union_map *dep_raw, *dep2, *dep3, *dep;
4085 isl_union_map *uninitialized;
4086 isl_schedule *schedule;
4087 struct isl_options *options;
4089 empty = isl_union_map_empty(isl_union_map_get_dim(sched));
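/* Compute the flow (read after write) dependences in "dep_raw" and the
 * reads without any corresponding write ("uninitialized"), as well as the
 * remaining anti and output dependences in "dep2" and "dep3".
 * The uninitialized reads determine the array elements that need to be
 * copied in to the device (gen->copy_in).
 */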
4091 isl_union_map_compute_flow(isl_union_map_copy(gen->read),
4092 isl_union_map_copy(gen->write), empty,
4093 isl_union_map_copy(sched),
4094 &dep_raw, NULL, &uninitialized, NULL);
4095 isl_union_map_compute_flow(isl_union_map_copy(gen->write),
4096 isl_union_map_copy(gen->write),
4097 isl_union_map_copy(gen->read),
4098 isl_union_map_copy(sched),
4099 &dep2, &dep3, NULL, NULL);
4100 isl_union_map_free(sched);
4102 gen->copy_in = isl_union_map_range(uninitialized);
4104 dep = isl_union_map_union(dep2, dep3);
4105 dep = isl_union_map_union(dep, dep_raw);
4106 dep = isl_union_map_coalesce(dep);
4108 domain = extract_domain(gen);
4109 options = isl_ctx_peek_options(ctx, isl_options_arg);
4110 options->schedule_outer_zero_distance = 1;
4111 schedule = isl_union_set_compute_schedule(isl_union_set_copy(domain),
4112 isl_union_map_copy(dep), dep);
4114 sched = select_outer_tilable_band(gen, schedule);
4116 isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
4117 sched = isl_union_map_intersect_domain(sched, domain);
4118 gen->sched = sched;
4120 isl_schedule_free(schedule);
4121 }
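/* Extract the access in "expr", append it to the list pointed to by
 * "next_access" and return a pointer to the new end of the list.
 */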
4123 static struct cuda_stmt_access **expr_extract_access(struct pet_expr *expr,
4124 struct cuda_stmt_access **next_access)
4125 {
4126 struct cuda_stmt_access *access;
4127 isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
4129 access = isl_alloc_type(ctx, struct cuda_stmt_access);
4130 assert(access);
4131 access->next = NULL;
4132 access->read = expr->acc.read;
4133 access->write = expr->acc.write;
4134 access->access = isl_map_copy(expr->acc.access);
4136 *next_access = access;
4137 next_access = &(*next_access)->next;
4138 return next_access;
4139 }
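/* Recursively extract all accesses in "expr" and its arguments,
 * appending them to the list pointed to by "next_access".
 */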
4141 static struct cuda_stmt_access **expr_extract_accesses(struct pet_expr *expr,
4142 struct cuda_stmt_access **next_access)
4143 {
4144 int i;
4146 for (i = 0; i < expr->n_arg; ++i)
4147 next_access = expr_extract_accesses(expr->args[i],
4148 next_access);
4150 if (expr->type == pet_expr_access)
4151 next_access = expr_extract_access(expr, next_access);
4153 return next_access;
4154 }
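/* Collect all accesses in the body of "stmt" into stmt->accesses. */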
4156 static void pet_stmt_extract_accesses(struct cuda_stmt *stmt)
4157 {
4158 struct cuda_stmt_access **next_access = &stmt->accesses;
4160 stmt->accesses = NULL;
4161 expr_extract_accesses(stmt->body, next_access);
4162 }
4164 /* Return an array of cuda_stmt representing the statements in "scop".
4165 */
4166 static struct cuda_stmt *extract_stmts(isl_ctx *ctx, struct pet_scop *scop,
4167 __isl_keep isl_set *context)
4168 {
4169 int i;
4170 struct cuda_stmt *stmts;
4172 stmts = isl_calloc_array(ctx, struct cuda_stmt, scop->n_stmt);
4173 assert(stmts);
4175 for (i = 0; i < scop->n_stmt; ++i) {
4176 struct cuda_stmt *s = &stmts[i];
4178 s->domain = isl_set_copy(scop->stmts[i]->domain);
4179 s->domain = isl_set_intersect(s->domain, isl_set_copy(context));
4180 s->body = scop->stmts[i]->body;
4181 pet_stmt_extract_accesses(s);
4182 }
4184 return stmts;
4185 }
4187 /* Replace the scop in the "input" file by equivalent code
4188 * that uses the GPU. "scop" is assumed to correspond to this scop.
4190 * We first compute a schedule that respects the dependences
4191 * of the original program and select the outermost band
4192 * of tilable dimensions that has at least one parallel loop.
4193 * We then have three blocks of dimensions
4195 * H B G
4197 * The tilable band "B" is first tiled according to "tile.sizes", resulting
4198 * in
4200 * H T P G
4202 * For each iteration of the T loop and for each array, we compute
4203 * the array elements accessed by that iteration, construct a rectangular
4204 * box around it and shift it to the origin. The result is used
4205 * as shared memory for the array.
4207 * We then split off at most 2 parallel loops from the T loops and
4208 * at most 3 parallel loops from the P loops
4210 * H T1 T2 P1 P2 G
4212 * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
4213 * according to "grid.sizes"/"block.sizes".
4215 * H T1T T1P T2 P1T P1P P2 G
4217 * Finally, the T1P and P1P iterators are equated to the block and
4218 * thread dimensions respectively and so are effectively removed.
4219 * The H loops are run on the host. The T1T, T2, P1T, P2 and G loops
4220 * are run on the GPU.
4222 * Code is generated in three stages. We first generate code for the
4223 * host (the H loops), with iterators h%d. Then, for each leaf node
4224 * of the resulting AST, we generate code for the shared loops (up to
4225 * and including T2), with iterators g%d and after equating the H loops
4226 * to h%d parameters and the T1P loops to the block dimensions.
4227 * Finally, we generate code for the remaining loops in a similar fashion.
4228 */
4229 int cuda_pet(isl_ctx *ctx, struct pet_scop *scop, struct ppcg_options *options,
4230 const char *input)
4231 {
4232 isl_union_map *sched;
4233 struct cuda_gen gen;
4235 if (!scop)
4236 return -1;
4238 scop = pet_scop_align_params(scop);
4240 gen.ctx = ctx;
4241 gen.context = isl_set_copy(scop->context);
4242 gen.context = add_context_from_str(gen.context, options->ctx);
4243 gen.n_stmts = scop->n_stmt;
4244 gen.stmts = extract_stmts(ctx, scop, gen.context);
4245 gen.read = pet_scop_collect_reads(scop);
4246 gen.write = pet_scop_collect_writes(scop);
4247 gen.options = options;
4248 gen.state = cloog_isl_state_malloc(gen.ctx);
4249 gen.scop = scop;
4251 cuda_open_files(&gen.cuda, input);
4253 collect_array_info(&gen);
4255 sched = pet_scop_collect_schedule(scop);
4257 compute_schedule(&gen, sched);
4259 print_host_code(&gen);
4261 cloog_state_free(gen.state);
4262 clear_cuda_gen(&gen);
4264 cuda_close_files(&gen.cuda);
4266 return 0;
4267 }