2 * Copyright 2010-2011 INRIA Saclay
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
7 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
14 #include <isl/polynomial.h>
15 #include <isl/union_set.h>
20 #include <isl/schedule.h>
21 #include <isl/options.h>
22 #include <cloog/isl/cloog.h>
25 #include "cuda_common.h"
28 #include "scoplib_isl.h"
29 #include "ppcg_options.h"
31 /* The fields stride, shift and shift_map only contain valid information
33 * If so, they express that current index is such that if you add shift,
34 * then the result is always a multiple of stride.
35 * shift_map contains the mapping
37 * i -> (i + shift)/stride
39 struct cuda_array_bound
{
44 isl_qpolynomial
*shift
;
45 isl_basic_map
*shift_map
;
48 struct cuda_array_info
;
50 /* A group of array references in a kernel that should be handled together.
51 * If private_bound is not NULL, then it is mapped to registers.
52 * Otherwise, if shared_bound is not NULL, it is mapped to shared memory.
53 * Otherwise, it is accesses from global memory.
55 struct cuda_array_ref_group
{
56 /* The references in this group access this array. */
57 struct cuda_array_info
*array
;
58 /* Position of this group in the list of reference groups of array. */
61 /* The following fields are use during the construction of the groups.
62 * access is the combined access relation relative to the shared
64 * write is set if any access in the group is a write.
69 /* For each index, size and offset of piece in shared memory. */
70 struct cuda_array_bound
*shared_bound
;
72 /* For each index, size and offset of piece in private memory. */
73 struct cuda_array_bound
*private_bound
;
75 /* References in this group; point to elements of a linked list. */
77 struct cuda_stmt_access
**refs
;
80 struct cuda_array_info
{
82 /* Name of the array. */
84 /* Number of indices. */
86 /* For each index, a bound on the array in that direction. */
88 /* For each index, bound[i] specialized to the current kernel. */
89 isl_pw_aff
**local_bound
;
91 /* All references to this array; point to elements of a linked list. */
93 struct cuda_stmt_access
**refs
;
95 /* The reference groups associated to this array. */
97 struct cuda_array_ref_group
**groups
;
99 /* Last shared memory tile dimension that affects tile of this array. */
101 /* Dimension at which copying to/from shared memory is printed.
102 * if >= 0, then the value is >= last_shared
103 * if -1, then the copying is done at the leaf level.
105 int print_shared_level
;
108 /* Print the name of the local copy of a given group of array references.
110 static void print_array_name(FILE *out
, struct cuda_array_ref_group
*group
)
114 if (group
->private_bound
)
115 fprintf(out
, "private_");
116 else if (group
->shared_bound
)
117 fprintf(out
, "shared_");
120 fprintf(out
, "%s", group
->array
->name
);
121 if (!global
&& group
->array
->n_group
> 1)
122 fprintf(out
, "_%d", group
->nr
);
125 /* Collect all references to the given array and store pointers to them
128 static void collect_references(struct cuda_gen
*gen
,
129 struct cuda_array_info
*array
)
135 for (i
= 0; i
< gen
->n_stmts
; ++i
) {
136 struct cuda_stmt
*stmt
= &gen
->stmts
[i
];
137 struct cuda_stmt_access
*access
;
139 for (access
= stmt
->accesses
; access
; access
= access
->next
) {
141 name
= isl_map_get_tuple_name(access
->access
,
143 if (name
&& !strcmp(array
->name
, name
))
149 array
->refs
= isl_alloc_array(gen
->ctx
, struct cuda_stmt_access
*, n
);
153 for (i
= 0; i
< gen
->n_stmts
; ++i
) {
154 struct cuda_stmt
*stmt
= &gen
->stmts
[i
];
155 struct cuda_stmt_access
*access
;
157 for (access
= stmt
->accesses
; access
; access
= access
->next
) {
159 name
= isl_map_get_tuple_name(access
->access
,
161 if (!name
|| strcmp(array
->name
, name
))
164 array
->refs
[n
++] = access
;
169 static struct cuda_array_bound
*create_bound_list(isl_ctx
*ctx
, int n_index
)
172 struct cuda_array_bound
*bound
;
174 bound
= isl_alloc_array(ctx
, struct cuda_array_bound
, n_index
);
177 for (i
= 0; i
< n_index
; ++i
) {
178 isl_int_init(bound
[i
].size
);
180 isl_int_init(bound
[i
].stride
);
181 bound
[i
].shift
= NULL
;
182 bound
[i
].shift_map
= NULL
;
188 static void free_bound_list(struct cuda_array_bound
*bound
, int n_index
)
195 for (j
= 0; j
< n_index
; ++j
) {
196 isl_int_clear(bound
[j
].size
);
197 isl_int_clear(bound
[j
].stride
);
198 isl_aff_free(bound
[j
].lb
);
199 isl_qpolynomial_free(bound
[j
].shift
);
200 isl_basic_map_free(bound
[j
].shift_map
);
205 /* Compute bounds on the host arrays based on the accessed elements
206 * and collect all references to the array.
208 static int extract_array_info(__isl_take isl_set
*array
, void *user
)
211 struct cuda_gen
*gen
= (struct cuda_gen
*)user
;
215 isl_pw_aff
**local_bounds
;
217 n_index
= isl_set_dim(array
, isl_dim_set
);
218 name
= isl_set_get_tuple_name(array
);
219 bounds
= isl_alloc_array(isl_set_get_ctx(array
),
220 isl_pw_aff
*, n_index
);
222 local_bounds
= isl_calloc_array(isl_set_get_ctx(array
),
223 isl_pw_aff
*, n_index
);
224 assert(local_bounds
);
225 gen
->array
[gen
->n_array
].dim
= isl_set_get_dim(array
);
226 gen
->array
[gen
->n_array
].name
= strdup(name
);
227 gen
->array
[gen
->n_array
].n_index
= n_index
;
228 gen
->array
[gen
->n_array
].bound
= bounds
;
229 gen
->array
[gen
->n_array
].local_bound
= local_bounds
;
231 for (i
= 0; i
< n_index
; ++i
) {
237 bound
= isl_set_dim_max(isl_set_copy(array
), i
);
238 dom
= isl_pw_aff_domain(isl_pw_aff_copy(bound
));
239 ls
= isl_local_space_from_dim(isl_set_get_dim(dom
));
240 one
= isl_aff_zero(ls
);
241 one
= isl_aff_add_constant_si(one
, 1);
242 bound
= isl_pw_aff_add(bound
, isl_pw_aff_alloc(dom
, one
));
243 bound
= isl_pw_aff_gist(bound
, isl_set_copy(gen
->context
));
248 collect_references(gen
, &gen
->array
[gen
->n_array
]);
256 void collect_array_info(struct cuda_gen
*gen
)
258 isl_union_set
*arrays
;
260 arrays
= isl_union_map_range(isl_union_map_copy(gen
->read
));
261 arrays
= isl_union_set_union(arrays
,
262 isl_union_map_range(isl_union_map_copy(gen
->write
)));
263 arrays
= isl_union_set_coalesce(arrays
);
265 gen
->n_array
= isl_union_set_n_set(arrays
);
266 gen
->array
= isl_alloc_array(gen
->ctx
,
267 struct cuda_array_info
, gen
->n_array
);
270 isl_union_set_foreach_set(arrays
, &extract_array_info
, gen
);
271 isl_union_set_free(arrays
);
274 static void free_array_info(struct cuda_gen
*gen
)
278 for (i
= 0; i
< gen
->n_array
; ++i
) {
279 int n_index
= gen
->array
[i
].n_index
;
280 free(gen
->array
[i
].name
);
281 for (j
= 0; j
< n_index
; ++j
) {
282 isl_pw_aff_free(gen
->array
[i
].bound
[j
]);
283 isl_pw_aff_free(gen
->array
[i
].local_bound
[j
]);
285 isl_dim_free(gen
->array
[i
].dim
);
286 free(gen
->array
[i
].bound
);
287 free(gen
->array
[i
].local_bound
);
288 free(gen
->array
[i
].refs
);
293 static void declare_device_arrays(struct cuda_gen
*gen
)
297 for (i
= 0; i
< gen
->n_array
; ++i
)
298 fprintf(gen
->cuda
.host_c
, "%s *dev_%s;\n",
299 gen
->options
->type
, gen
->array
[i
].name
);
302 static void print_array_size(struct cuda_gen
*gen
, FILE *out
,
303 struct cuda_array_info
*array
)
308 prn
= isl_printer_to_file(gen
->ctx
, out
);
309 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
310 for (i
= 0; i
< array
->n_index
; ++i
) {
311 prn
= isl_printer_print_str(prn
, "(");
312 prn
= isl_printer_print_pw_aff(prn
, array
->bound
[i
]);
313 prn
= isl_printer_print_str(prn
, ") * ");
315 prn
= isl_printer_print_str(prn
, "sizeof(");
316 prn
= isl_printer_print_str(prn
, gen
->options
->type
);
317 prn
= isl_printer_print_str(prn
, ")");
318 isl_printer_free(prn
);
321 static void allocate_device_arrays(struct cuda_gen
*gen
)
325 for (i
= 0; i
< gen
->n_array
; ++i
) {
326 fprintf(gen
->cuda
.host_c
, "cudaMalloc(&dev_%s, ",
328 print_array_size(gen
, gen
->cuda
.host_c
, &gen
->array
[i
]);
329 fprintf(gen
->cuda
.host_c
, ");\n");
333 static void free_device_arrays(struct cuda_gen
*gen
)
337 for (i
= 0; i
< gen
->n_array
; ++i
)
338 fprintf(gen
->cuda
.host_c
, "cudaFree(dev_%s);\n",
342 static void copy_arrays_to_device(struct cuda_gen
*gen
)
346 for (i
= 0; i
< gen
->n_array
; ++i
) {
351 dim
= isl_dim_copy(gen
->array
[i
].dim
);
352 read_i
= isl_union_set_extract_set(gen
->copy_in
, dim
);
353 empty
= isl_set_fast_is_empty(read_i
);
354 isl_set_free(read_i
);
358 fprintf(gen
->cuda
.host_c
, "assert(sizeof(%s) == ",
360 print_array_size(gen
, gen
->cuda
.host_c
, &gen
->array
[i
]);
361 fprintf(gen
->cuda
.host_c
, ");\n");
362 fprintf(gen
->cuda
.host_c
, "cudaMemcpy(dev_%s, %s, ",
363 gen
->array
[i
].name
, gen
->array
[i
].name
);
364 print_array_size(gen
, gen
->cuda
.host_c
, &gen
->array
[i
]);
365 fprintf(gen
->cuda
.host_c
, ", cudaMemcpyHostToDevice);\n");
369 static void copy_arrays_from_device(struct cuda_gen
*gen
)
372 isl_union_set
*write
;
373 write
= isl_union_map_range(isl_union_map_copy(gen
->write
));
375 for (i
= 0; i
< gen
->n_array
; ++i
) {
380 dim
= isl_dim_copy(gen
->array
[i
].dim
);
381 write_i
= isl_union_set_extract_set(write
, dim
);
382 empty
= isl_set_fast_is_empty(write_i
);
383 isl_set_free(write_i
);
387 fprintf(gen
->cuda
.host_c
, "cudaMemcpy(%s, dev_%s, ",
388 gen
->array
[i
].name
, gen
->array
[i
].name
);
389 print_array_size(gen
, gen
->cuda
.host_c
, &gen
->array
[i
]);
390 fprintf(gen
->cuda
.host_c
, ", cudaMemcpyDeviceToHost);\n");
393 isl_union_set_free(write
);
396 static void read_sizes_from_file(struct cuda_gen
*gen
, const char *filename
,
402 file
= fopen(filename
, "r");
406 for (i
= 0; i
< len
; ++i
)
407 if (fscanf(file
, "%d", &sizes
[i
]) < 1)
413 static void reverse_list(int *list
, int len
)
418 for (i
= 0; 2 * i
< len
; ++i
) {
420 list
[i
] = list
[len
- 1 - i
];
421 list
[len
- 1 - i
] = t
;
425 /* Read user specified sizes from "tile.sizes", "block.sizes" and "grid.sizes"
426 * after filling in some potentially useful defaults.
428 static void read_sizes(struct cuda_gen
*gen
)
432 gen
->tile_size
= isl_alloc_array(gen
->ctx
, int, gen
->tile_len
);
433 assert(gen
->tile_size
);
434 for (n
= 0; n
< gen
->tile_len
; ++n
)
435 gen
->tile_size
[n
] = gen
->options
->tile_size
;
436 read_sizes_from_file(gen
, "tile.sizes", gen
->tile_size
, gen
->tile_len
);
439 gen
->n_block
= (n
<= 3) ? n
: 3;
440 switch (gen
->n_block
) {
442 gen
->block_dim
[0] = 512;
445 gen
->block_dim
[0] = 32;
446 gen
->block_dim
[1] = 16;
449 gen
->block_dim
[0] = 32;
450 gen
->block_dim
[1] = 4;
451 gen
->block_dim
[2] = 4;
454 read_sizes_from_file(gen
, "block.sizes", gen
->block_dim
, gen
->n_block
);
455 reverse_list(gen
->block_dim
, gen
->n_block
);
457 gen
->n_grid
= (n
<= 2) ? n
: 2;
458 switch (gen
->n_grid
) {
460 gen
->grid_dim
[0] = 65536;
463 gen
->grid_dim
[0] = 256;
464 gen
->grid_dim
[1] = 256;
467 read_sizes_from_file(gen
, "grid.sizes", gen
->grid_dim
, gen
->n_grid
);
468 reverse_list(gen
->grid_dim
, gen
->n_grid
);
471 static void free_stmts(struct cuda_stmt
*stmts
, int n
)
475 for (i
= 0; i
< n
; ++i
) {
476 struct cuda_stmt_access
*access
, *next
;
478 for (access
= stmts
[i
].accesses
; access
; access
= next
) {
480 isl_map_free(access
->access
);
484 isl_set_free(stmts
[i
].domain
);
490 void clear_cuda_gen(struct cuda_gen
*gen
)
492 free_stmts(gen
->stmts
, gen
->n_stmts
);
493 free_array_info(gen
);
494 isl_set_free(gen
->context
);
495 isl_union_set_free(gen
->copy_in
);
496 isl_union_map_free(gen
->sched
);
497 isl_union_map_free(gen
->read
);
498 isl_union_map_free(gen
->write
);
501 static void print_reverse_list(FILE *out
, int len
, int *list
)
505 for (i
= 0; i
< len
; ++i
) {
508 fprintf(out
, "%d", list
[len
- 1 - i
]);
512 static void print_kernel_launch(struct cuda_gen
*gen
,
513 __isl_keep isl_union_set
*arrays
)
520 print_indent(gen
->code
.dst
, gen
->code
.indent
);
521 fprintf(gen
->code
.dst
, "kernel%d <<<k%d_dimGrid, k%d_dimBlock>>> (",
522 gen
->kernel_id
, gen
->kernel_id
, gen
->kernel_id
);
523 fprintf(gen
->cuda
.kernel_c
, "__global__ void kernel%d(",
525 fprintf(gen
->cuda
.kernel_h
, "__global__ void kernel%d(",
528 for (i
= 0; i
< gen
->n_array
; ++i
) {
533 dim
= isl_dim_copy(gen
->array
[i
].dim
);
534 arr
= isl_union_set_extract_set(arrays
, dim
);
535 empty
= isl_set_fast_is_empty(arr
);
541 fprintf(gen
->code
.dst
, ", ");
542 fprintf(gen
->cuda
.kernel_c
, ", ");
543 fprintf(gen
->cuda
.kernel_h
, ", ");
546 fprintf(gen
->code
.dst
, "dev_%s", gen
->array
[i
].name
);
547 fprintf(gen
->cuda
.kernel_c
, "%s *%s",
548 gen
->options
->type
, gen
->array
[i
].name
);
549 fprintf(gen
->cuda
.kernel_h
, "%s *%s",
550 gen
->options
->type
, gen
->array
[i
].name
);
555 dim
= isl_union_set_get_dim(arrays
);
556 nparam
= isl_dim_size(dim
, isl_dim_param
);
557 for (i
= 0; i
< nparam
; ++i
) {
558 const char *name
= isl_dim_get_name(dim
, isl_dim_param
, i
);
560 fprintf(gen
->code
.dst
, ", ");
561 fprintf(gen
->cuda
.kernel_c
, ", ");
562 fprintf(gen
->cuda
.kernel_h
, ", ");
564 fprintf(gen
->code
.dst
, "%s", name
);
565 fprintf(gen
->cuda
.kernel_c
, "int %s", name
);
566 fprintf(gen
->cuda
.kernel_h
, "int %s", name
);
571 for (i
= 0; i
< gen
->tile_first
; ++i
) {
573 fprintf(gen
->code
.dst
, ", ");
574 fprintf(gen
->cuda
.kernel_c
, ", ");
575 fprintf(gen
->cuda
.kernel_h
, ", ");
577 fprintf(gen
->code
.dst
, "h%d", i
);
578 fprintf(gen
->cuda
.kernel_c
, "int h%d", i
);
579 fprintf(gen
->cuda
.kernel_h
, "int h%d", i
);
583 fprintf(gen
->code
.dst
, ");\n");
584 fprintf(gen
->cuda
.kernel_c
, ")\n");
585 fprintf(gen
->cuda
.kernel_h
, ");\n");
588 /* Construct a map from a domain of dimensionality "len"
589 * to a domain of dimensionality "len" + "tile_len" that tiles
590 * the "tile_len" coordinates starting at "first".
591 * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
592 * "dim" prescribes the parameters.
594 static __isl_give isl_map
*tile(__isl_take isl_dim
*dim
, int len
,
595 int first
, int tile_len
, int *tile_size
)
604 dim
= isl_dim_add(dim
, isl_dim_in
, len
);
605 dim
= isl_dim_add(dim
, isl_dim_out
, len
+ tile_len
);
606 bmap
= isl_basic_map_universe(isl_dim_copy(dim
));
608 for (i
= 0; i
< len
- tile_len
; ++i
) {
609 int j
= i
< first
? i
: i
+ tile_len
;
610 int k
= i
< first
? i
: i
+ 2 * tile_len
;
612 c
= isl_equality_alloc(isl_dim_copy(dim
));
613 isl_int_set_si(v
, -1);
614 isl_constraint_set_coefficient(c
, isl_dim_in
, j
, v
);
615 isl_int_set_si(v
, 1);
616 isl_constraint_set_coefficient(c
, isl_dim_out
, k
, v
);
617 bmap
= isl_basic_map_add_constraint(bmap
, c
);
620 for (i
= 0; i
< tile_len
; ++i
) {
621 c
= isl_equality_alloc(isl_dim_copy(dim
));
622 isl_int_set_si(v
, -1);
623 isl_constraint_set_coefficient(c
, isl_dim_in
, first
+ i
, v
);
624 isl_int_set_si(v
, tile_size
[i
]);
625 isl_constraint_set_coefficient(c
, isl_dim_out
, first
+ i
, v
);
626 isl_int_set_si(v
, 1);
627 isl_constraint_set_coefficient(c
, isl_dim_out
,
628 first
+ i
+ tile_len
, v
);
629 bmap
= isl_basic_map_add_constraint(bmap
, c
);
631 c
= isl_inequality_alloc(isl_dim_copy(dim
));
632 isl_int_set_si(v
, 1);
633 isl_constraint_set_coefficient(c
, isl_dim_out
,
634 first
+ i
+ tile_len
, v
);
635 bmap
= isl_basic_map_add_constraint(bmap
, c
);
637 c
= isl_inequality_alloc(isl_dim_copy(dim
));
638 isl_int_set_si(v
, -1);
639 isl_constraint_set_coefficient(c
, isl_dim_out
,
640 first
+ i
+ tile_len
, v
);
641 isl_int_set_si(v
, tile_size
[i
] - 1);
642 isl_constraint_set_constant(c
, v
);
643 bmap
= isl_basic_map_add_constraint(bmap
, c
);
649 return isl_map_from_basic_map(bmap
);
652 /* Construct a map from a domain of dimensionality "len"
653 * to a domain of dimensionality "len" + "wrap_len" that "wraps"
654 * the "wrap_len" coordinates starting at "first" according to "wrap_size".
655 * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
656 * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
657 * that are projected out at the end.
658 * "dim" prescribes the parameters.
660 static __isl_give isl_map
*wrap(__isl_take isl_dim
*dim
, int len
,
661 int first
, int wrap_len
, int *wrap_size
)
667 dim
= isl_dim_add(dim
, isl_dim_in
, len
);
668 dim
= isl_dim_add(dim
, isl_dim_out
, len
+ 2 * wrap_len
);
669 bmap
= isl_basic_map_universe(isl_dim_copy(dim
));
671 for (i
= 0; i
< len
; ++i
) {
672 int k
= i
< first
+ wrap_len
? i
: i
+ 2 * wrap_len
;
674 c
= isl_equality_alloc(isl_dim_copy(dim
));
675 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, -1);
676 isl_constraint_set_coefficient_si(c
, isl_dim_out
, k
, 1);
677 bmap
= isl_basic_map_add_constraint(bmap
, c
);
680 for (i
= 0; i
< wrap_len
; ++i
) {
681 c
= isl_equality_alloc(isl_dim_copy(dim
));
682 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
684 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
685 first
+ wrap_len
+ i
, 1);
686 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
687 first
+ 2 * wrap_len
+ i
, wrap_size
[i
]);
688 bmap
= isl_basic_map_add_constraint(bmap
, c
);
690 c
= isl_inequality_alloc(isl_dim_copy(dim
));
691 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
692 first
+ wrap_len
+ i
, 1);
693 bmap
= isl_basic_map_add_constraint(bmap
, c
);
695 c
= isl_inequality_alloc(isl_dim_copy(dim
));
696 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
697 first
+ wrap_len
+ i
, -1);
698 isl_constraint_set_constant_si(c
, wrap_size
[i
] - 1);
699 bmap
= isl_basic_map_add_constraint(bmap
, c
);
704 bmap
= isl_basic_map_project_out(bmap
, isl_dim_out
,
705 first
+ 2 * wrap_len
, wrap_len
);
707 return isl_map_from_basic_map(bmap
);
710 /* Add "n" parameters named prefix%d.
712 static __isl_give isl_set
*add_params( __isl_take isl_set
*set
,
713 int n
, const char *prefix
)
719 nparam
= isl_set_dim(set
, isl_dim_param
);
720 set
= isl_set_add_dims(set
, isl_dim_param
, n
);
722 for (i
= 0; i
< n
; ++i
) {
723 snprintf(name
, sizeof(name
), "%s%d", prefix
, i
);
724 set
= isl_set_set_dim_name(set
, isl_dim_param
,
731 /* Equate the "n" dimensions of "set" starting at "first" to
732 * freshly created parameters named prefix%d.
734 static __isl_give isl_set
*parametrize(__isl_take isl_set
*set
,
735 int first
, int n
, const char *prefix
)
744 nparam
= isl_set_dim(set
, isl_dim_param
);
746 set
= add_params(set
, n
, prefix
);
748 dim
= isl_set_get_dim(set
);
749 bset
= isl_basic_set_universe(isl_dim_copy(dim
));
753 for (i
= 0; i
< n
; ++i
) {
754 c
= isl_equality_alloc(isl_dim_copy(dim
));
755 isl_int_set_si(v
, -1);
756 isl_constraint_set_coefficient(c
, isl_dim_param
, nparam
+ i
, v
);
757 isl_int_set_si(v
, 1);
758 isl_constraint_set_coefficient(c
, isl_dim_set
, first
+ i
, v
);
759 bset
= isl_basic_set_add_constraint(bset
, c
);
765 return isl_set_intersect(set
, isl_set_from_basic_set(bset
));
768 static __isl_give isl_set
*parametrization(__isl_take isl_dim
*dim
,
769 int len
, int first
, int n
, const char *prefix
)
773 dim
= isl_dim_add(dim
, isl_dim_set
, len
);
774 set
= isl_set_universe(dim
);
776 return parametrize(set
, first
, n
, prefix
);
779 /* Tile the B loops over the tile sizes and then tile/wrap
780 * the T1 loops over the blocks.
782 static __isl_give isl_union_map
*tile_schedule(struct cuda_gen
*gen
,
783 __isl_take isl_union_map
*sched
)
786 isl_map
*tiling
, *block_tiling
;
788 dim
= isl_union_map_get_dim(sched
);
789 tiling
= tile(isl_dim_copy(dim
), gen
->untiled_len
,
790 gen
->tile_first
, gen
->tile_len
, gen
->tile_size
);
792 if (gen
->options
->wrap
)
793 block_tiling
= wrap(dim
, gen
->untiled_len
+ gen
->tile_len
,
794 gen
->tile_first
, gen
->n_grid
, gen
->grid_dim
);
796 block_tiling
= tile(dim
, gen
->untiled_len
+ gen
->tile_len
,
797 gen
->tile_first
, gen
->n_grid
, gen
->grid_dim
);
799 gen
->tiled_len
= gen
->untiled_len
+ gen
->tile_len
+ gen
->n_grid
;
801 tiling
= isl_map_apply_range(tiling
, block_tiling
);
803 sched
= isl_union_map_apply_range(sched
,
804 isl_union_map_from_map(tiling
));
806 gen
->shared_len
= gen
->tile_first
+ gen
->tile_len
+ gen
->n_grid
;
811 static __isl_give isl_union_map
*parametrize_tiled_schedule(
812 struct cuda_gen
*gen
, __isl_take isl_union_map
*sched
)
817 dim
= isl_union_map_get_dim(sched
);
818 par
= parametrization(dim
, gen
->tiled_len
, 0, gen
->tile_first
, "h");
819 sched
= isl_union_map_intersect_range(sched
,
820 isl_union_set_from_set(par
));
822 dim
= isl_union_map_get_dim(sched
);
823 par
= parametrization(dim
, gen
->tiled_len
,
824 gen
->tile_first
+ gen
->n_grid
, gen
->n_grid
, "b");
825 sched
= isl_union_map_intersect_range(sched
,
826 isl_union_set_from_set(par
));
831 /* Tile/wrap the P1 loops over the threads.
833 static __isl_give isl_union_map
*thread_tile_schedule(struct cuda_gen
*gen
,
834 __isl_take isl_union_map
*sched
)
840 dim
= isl_union_map_get_dim(sched
);
842 if (gen
->options
->wrap
)
843 tiling
= wrap(isl_dim_copy(dim
), gen
->tiled_len
,
844 gen
->shared_len
, gen
->n_block
, gen
->block_dim
);
846 tiling
= tile(isl_dim_copy(dim
), gen
->tiled_len
,
847 gen
->shared_len
, gen
->n_block
, gen
->block_dim
);
848 gen
->thread_tiled_len
= gen
->tiled_len
+ gen
->n_block
;
850 sched
= isl_union_map_apply_range(sched
,
851 isl_union_map_from_map(tiling
));
853 par
= parametrization(dim
, gen
->thread_tiled_len
,
854 gen
->tile_first
+ gen
->tile_len
+ gen
->n_grid
+ gen
->n_block
,
856 sched
= isl_union_map_intersect_range(sched
,
857 isl_union_set_from_set(par
));
859 gen
->shared_len
= gen
->tile_first
+ gen
->tile_len
+ gen
->n_grid
;
864 /* If the user asked for it, scale the shared memory tile loops
865 * (T1P and T2) of "sched" by gen->tile_size[i].
866 * If we are not performing "wrapping", then additionally scale the T1P
867 * loops by gen->grid_dim[i].
869 static __isl_give isl_union_map
*scale_tile_loops(struct cuda_gen
*gen
,
870 __isl_take isl_union_map
*sched
)
874 isl_basic_map
*scale
;
877 if (!gen
->options
->scale_tile_loops
)
880 dim
= isl_union_map_get_dim(sched
);
881 dim
= isl_dim_add(dim
, isl_dim_in
, gen
->tiled_len
);
882 dim
= isl_dim_add(dim
, isl_dim_out
, gen
->tiled_len
);
883 scale
= isl_basic_map_universe(isl_dim_copy(dim
));
885 for (i
= 0; i
< gen
->tiled_len
; ++i
) {
888 if (i
>= gen
->tile_first
&& i
< gen
->tile_first
+ gen
->n_grid
) {
889 f
= gen
->tile_size
[i
- gen
->tile_first
];
890 if (!gen
->options
->wrap
)
891 f
*= gen
->grid_dim
[i
- gen
->tile_first
];
892 } else if (i
>= gen
->tile_first
+ gen
->n_grid
&&
893 i
< gen
->tile_first
+ gen
->n_grid
+ gen
->tile_len
) {
894 f
= gen
->tile_size
[i
- (gen
->tile_first
+ gen
->n_grid
)];
897 c
= isl_equality_alloc(isl_dim_copy(dim
));
898 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, f
);
899 isl_constraint_set_coefficient_si(c
, isl_dim_out
, i
, -1);
900 scale
= isl_basic_map_add_constraint(scale
, c
);
905 sched
= isl_union_map_apply_range(sched
,
906 isl_union_map_from_map(isl_map_from_basic_map(scale
)));
911 /* If we are not performing "wrapping" and if the user asked for it,
912 * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
914 static __isl_give isl_union_map
*scale_thread_tile_loops(struct cuda_gen
*gen
,
915 __isl_take isl_union_map
*sched
)
919 isl_basic_map
*scale
;
922 if (gen
->options
->wrap
)
924 if (!gen
->options
->scale_tile_loops
)
927 dim
= isl_union_map_get_dim(sched
);
928 dim
= isl_dim_add(dim
, isl_dim_in
, gen
->thread_tiled_len
);
929 dim
= isl_dim_add(dim
, isl_dim_out
, gen
->thread_tiled_len
);
930 scale
= isl_basic_map_universe(isl_dim_copy(dim
));
932 for (i
= 0; i
< gen
->thread_tiled_len
; ++i
) {
935 if (i
>= gen
->shared_len
&&
936 i
< gen
->shared_len
+ gen
->n_block
)
937 f
= gen
->block_dim
[i
- gen
->shared_len
];
939 c
= isl_equality_alloc(isl_dim_copy(dim
));
940 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, f
);
941 isl_constraint_set_coefficient_si(c
, isl_dim_out
, i
, -1);
942 scale
= isl_basic_map_add_constraint(scale
, c
);
947 sched
= isl_union_map_apply_range(sched
,
948 isl_union_map_from_map(isl_map_from_basic_map(scale
)));
953 /* If we are not performing "wrapping" and if the user asked for it,
954 * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
956 static __isl_give isl_union_map
*scale_access_tile_loops(struct cuda_gen
*gen
,
957 __isl_take isl_union_map
*sched
, int len
, int first
, int n_tile
)
961 isl_basic_map
*scale
;
964 if (gen
->options
->wrap
)
966 if (!gen
->options
->scale_tile_loops
)
969 dim
= isl_union_map_get_dim(sched
);
970 dim
= isl_dim_add(dim
, isl_dim_in
, len
);
971 dim
= isl_dim_add(dim
, isl_dim_out
, len
);
972 scale
= isl_basic_map_universe(isl_dim_copy(dim
));
974 for (i
= 0; i
< len
; ++i
) {
977 if (i
>= first
&& i
< first
+ n_tile
)
978 f
= gen
->block_dim
[i
- first
];
980 c
= isl_equality_alloc(isl_dim_copy(dim
));
981 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, f
);
982 isl_constraint_set_coefficient_si(c
, isl_dim_out
, i
, -1);
983 scale
= isl_basic_map_add_constraint(scale
, c
);
988 sched
= isl_union_map_apply_range(sched
,
989 isl_union_map_from_map(isl_map_from_basic_map(scale
)));
994 /* If print_user_stmt is set, we want to print the statements ourselves,
995 * instead of relying on the C preprocessor. If so, we need to use
996 * the stop option so that the domains will be saved on the statement
999 static void print_cloog_shared_body(struct cuda_gen
*gen
,
1000 __isl_keep isl_set
*context
, __isl_keep isl_union_map
*sched
, int len
,
1001 void (*print_user_stmt
)(struct gpucode_info
*info
,
1002 struct clast_user_stmt
*s
),
1006 CloogOptions
*options
;
1007 CloogDomain
*cloog_context
;
1008 CloogUnionDomain
*ud
;
1010 struct clast_stmt
*stmt
;
1013 sched
= isl_union_map_copy(sched
);
1014 sched
= isl_union_map_align_params(sched
, isl_set_get_dim(context
));
1016 options
= cloog_options_malloc(gen
->state
);
1017 options
->language
= LANGUAGE_C
;
1018 options
->strides
= 1;
1022 options
->override
= 1;
1023 options
->save_domains
= 1;
1024 options
->noscalars
= 1;
1025 options
->first_unroll
= first_unroll
;
1027 ud
= cloog_union_domain_from_isl_union_map(sched
);
1028 for (i
= 0; i
< len
; ++i
) {
1029 snprintf(name
, sizeof(name
), "c%d", i
);
1030 ud
= cloog_union_domain_set_name(ud
, CLOOG_SCAT
, i
, name
);
1032 cloog_context
= cloog_domain_from_isl_set(isl_set_copy(context
));
1033 input
= cloog_input_alloc(cloog_context
, ud
);
1035 stmt
= cloog_clast_create_from_input(input
, options
);
1037 gen
->stmt_code
.indent
= gen
->kernel_code
.indent
;
1038 gen
->stmt_code
.dst
= gen
->cuda
.kernel_c
;
1039 gen
->stmt_code
.print_user_stmt
= print_user_stmt
;
1040 gen
->stmt_code
.print_user_stmt_list
= NULL
;
1041 gen
->stmt_code
.print_for_head
= NULL
;
1042 gen
->stmt_code
.print_for_foot
= NULL
;
1043 gen
->stmt_code
.user
= gen
;
1044 gpu_print_host_stmt(&gen
->stmt_code
, stmt
);
1046 cloog_clast_free(stmt
);
1047 cloog_options_free(options
);
1050 /* Add "len" parameters p[i] called prefix%d,
1051 * with bounds to 0 <= p[i] < size[i].
1053 __isl_give isl_set
*add_bounded_parameters(__isl_take isl_set
*set
,
1054 int len
, int *size
, const char *prefix
)
1060 isl_basic_set
*bset
;
1064 nparam
= isl_set_dim(set
, isl_dim_param
);
1065 set
= isl_set_add_dims(set
, isl_dim_param
, len
);
1067 for (i
= 0; i
< len
; ++i
) {
1068 snprintf(name
, sizeof(name
), "%s%d", prefix
, i
);
1069 set
= isl_set_set_dim_name(set
, isl_dim_param
,
1073 dim
= isl_set_get_dim(set
);
1074 bset
= isl_basic_set_universe(isl_dim_copy(dim
));
1078 for (i
= 0; i
< len
; ++i
) {
1079 c
= isl_inequality_alloc(isl_dim_copy(dim
));
1080 isl_int_set_si(v
, 1);
1081 isl_constraint_set_coefficient(c
, isl_dim_param
, nparam
+ i
, v
);
1082 bset
= isl_basic_set_add_constraint(bset
, c
);
1084 c
= isl_inequality_alloc(isl_dim_copy(dim
));
1085 isl_int_set_si(v
, -1);
1086 isl_constraint_set_coefficient(c
, isl_dim_param
, nparam
+ i
, v
);
1087 isl_int_set_si(v
, size
[i
] - 1);
1088 isl_constraint_set_constant(c
, v
);
1089 bset
= isl_basic_set_add_constraint(bset
, c
);
1095 return isl_set_intersect(set
, isl_set_from_basic_set(bset
));
1098 static void print_shared_body(struct cuda_gen
*gen
,
1099 __isl_keep isl_set
*shared_domain
, __isl_keep isl_union_map
*sched
,
1100 int len
, void (*print_user_stmt
)(struct gpucode_info
*info
,
1101 struct clast_user_stmt
*s
),
1106 context
= isl_set_copy(shared_domain
);
1107 context
= parametrize(context
, 0, gen
->shared_len
, "g");
1108 context
= isl_set_project_out(context
, isl_dim_set
, 0, gen
->shared_len
);
1109 context
= add_bounded_parameters(context
,
1110 gen
->n_block
, gen
->block_dim
, "t");
1112 print_cloog_shared_body(gen
, context
, sched
, len
, print_user_stmt
,
1115 isl_set_free(context
);
1118 /* Given a tile of an array, construct a map that maps each element
1119 * of the tile to a copy of the tile shifted to the origin
1120 * (based on the lower bounds in group->private_bound or group->shared_bound).
1121 * If any of the indices is strided, then {private,shared}_bound[i].shift_map
1122 * is applied to the index first.
1123 * The domain of the resulting map is "access",
1124 * while the range space is anonymous.
1126 static __isl_give isl_map
*shift_access(__isl_take isl_set
*access
,
1127 struct cuda_array_ref_group
*group
)
1131 isl_basic_set
*bset
;
1132 isl_basic_map
*bmap
;
1134 isl_basic_set
*offset
;
1135 isl_basic_map
*shift
;
1136 isl_basic_map
*pre_shift
;
1139 struct cuda_array_bound
*bounds
;
1140 int n_index
= group
->array
->n_index
;
1142 bounds
= group
->private_bound
;
1144 bounds
= group
->shared_bound
;
1146 dim
= isl_set_get_dim(access
);
1147 dim
= isl_dim_drop(dim
, isl_dim_set
, 0, n_index
);
1148 offset
= isl_basic_set_universe(dim
);
1149 for (i
= 0; i
< n_index
; ++i
) {
1150 lb
= isl_aff_copy(bounds
[i
].lb
);
1151 bmap
= isl_basic_map_from_aff(lb
);
1152 bset
= isl_basic_map_range(bmap
);
1153 offset
= isl_basic_set_flat_product(offset
, bset
);
1155 offset
= isl_basic_set_neg(offset
);
1157 dim
= isl_dim_map_from_set(isl_set_get_dim(access
));
1158 shift
= isl_basic_map_identity(dim
);
1159 shift
= isl_basic_map_set_tuple_name(shift
, isl_dim_out
, NULL
);
1161 bset
= isl_basic_set_universe(isl_set_get_dim(access
));
1162 bmap
= isl_basic_map_from_domain_and_range(bset
, offset
);
1164 shift
= isl_basic_map_sum(shift
, bmap
);
1166 dim
= isl_set_get_dim(access
);
1167 dim
= isl_dim_drop(dim
, isl_dim_set
, 0, n_index
);
1168 dim
= isl_dim_map_from_set(dim
);
1169 pre_shift
= isl_basic_map_universe(isl_dim_copy(dim
));
1170 dim
= isl_dim_add(dim
, isl_dim_in
, 1);
1171 dim
= isl_dim_add(dim
, isl_dim_out
, 1);
1172 for (i
= 0; i
< n_index
; ++i
) {
1173 if (!bounds
[i
].shift_map
)
1174 bmap
= isl_basic_map_identity(isl_dim_copy(dim
));
1176 bmap
= isl_basic_map_copy(bounds
[i
].shift_map
);
1177 pre_shift
= isl_basic_map_flat_product(pre_shift
, bmap
);
1180 name
= isl_basic_map_get_tuple_name(shift
, isl_dim_in
);
1181 pre_shift
= isl_basic_map_set_tuple_name(pre_shift
, isl_dim_in
, name
);
1182 pre_shift
= isl_basic_map_set_tuple_name(pre_shift
, isl_dim_out
, name
);
1183 shift
= isl_basic_map_apply_range(pre_shift
, shift
);
1185 sched
= isl_map_from_basic_map(shift
);
1186 sched
= isl_map_intersect_domain(sched
, access
);
1191 /* Construct a schedule for iterating over all elements in the given
1192 * piece of an array. The schedule iterates over a copy of the piece
1193 * that is shifted to the origin.
1194 * We subsequently also perform the tiling/wrapping over the threads.
1196 * In particular, we tile the final iterators so that the final thread
1197 * dimension runs over the final array dimension.
1198 * However, if those final iterators have only a single iteration,
1199 * we try to tile earlier iterators instead.
1201 static __isl_give isl_union_map
*access_schedule(struct cuda_gen
*gen
,
1202 __isl_take isl_set
*access
, struct cuda_array_ref_group
*group
)
1206 isl_union_map
*usched
;
1209 unsigned nvar
= isl_set_dim(access
, isl_dim_set
);
1213 sched
= shift_access(access
, group
);
1215 n_tile
= gen
->n_block
;
1216 if (n_tile
> nvar
) {
1218 sched
= isl_map_insert(sched
, isl_dim_out
, 0, n_tile
- nvar
);
1219 for (i
= 0; i
< n_tile
- nvar
; ++i
)
1220 sched
= isl_map_fix_si(sched
, isl_dim_out
, i
, 0);
1224 first
= nvar
- n_tile
;
1226 for (; first
> 0; first
--)
1227 if (!isl_map_plain_is_fixed(sched
, isl_dim_out
,
1228 first
+ n_tile
- 1, NULL
))
1231 dim
= isl_map_get_dim(sched
);
1232 dim
= isl_dim_drop(dim
, isl_dim_in
, 0, isl_dim_size(dim
, isl_dim_in
));
1233 dim
= isl_dim_drop(dim
, isl_dim_out
, 0, nvar
);
1234 if (gen
->options
->wrap
)
1235 tiling
= wrap(isl_dim_copy(dim
), nvar
, first
,
1236 n_tile
, gen
->block_dim
);
1238 tiling
= tile(isl_dim_copy(dim
), nvar
, first
,
1239 n_tile
, gen
->block_dim
);
1240 sched
= isl_map_apply_range(sched
, tiling
);
1242 par
= parametrization(dim
, nvar
+ n_tile
, first
+ n_tile
, n_tile
, "t");
1243 usched
= isl_union_map_from_map(sched
);
1244 usched
= isl_union_map_intersect_range(usched
,
1245 isl_union_set_from_set(par
));
1247 usched
= scale_access_tile_loops(gen
, usched
, nvar
+ n_tile
,
1253 static void print_shared_access(struct cuda_gen
*gen
,
1254 __isl_keep isl_set
*shared_domain
, __isl_take isl_set
*access
,
1255 const char *type
, struct cuda_array_ref_group
*group
)
1257 const char *array_name
;
1260 isl_union_map
*sched
;
1261 unsigned nvar
= isl_set_dim(access
, isl_dim_set
);
1264 ctx
= isl_set_get_ctx(access
);
1265 array_name
= isl_set_get_tuple_name(access
);
1266 name
= isl_alloc_array(ctx
, char,
1267 strlen(type
) + sizeof("_shared_") + strlen(array_name
) + 20);
1268 if (group
->array
->n_group
> 1)
1269 sprintf(name
, "%s_shared_%s_%d", type
, array_name
, group
->nr
);
1271 sprintf(name
, "%s_shared_%s", type
, array_name
);
1272 access
= isl_set_set_tuple_name(access
, name
);
1275 sched
= access_schedule(gen
, access
, group
);
1277 n_tile
= gen
->n_block
;
1281 print_shared_body(gen
, shared_domain
, sched
, nvar
+ n_tile
, NULL
, -1);
1283 isl_union_map_free(sched
);
1286 /* Return the union of all read (read = 1) and/or write (write = 1)
1287 * access relations in the group.
1289 static __isl_give isl_union_map
*group_access_relation(
1290 struct cuda_array_ref_group
*group
, int read
, int write
)
1293 isl_union_map
*access
;
1295 access
= isl_union_map_empty(isl_map_get_dim(group
->access
));
1296 for (i
= 0; i
< group
->n_ref
; ++i
) {
1299 if (!((read
&& group
->refs
[i
]->read
) ||
1300 (write
&& group
->refs
[i
]->write
)))
1302 map_i
= isl_map_copy(group
->refs
[i
]->access
);
1303 access
= isl_union_map_union(access
,
1304 isl_union_map_from_map(map_i
));
1310 /* Check that none of the shared memory tiles involve any strides.
1312 static int no_strides(struct cuda_array_ref_group
*group
)
1315 int n_index
= group
->array
->n_index
;
1317 for (i
= 0; i
< n_index
; ++i
)
1318 if (group
->shared_bound
[i
].shift
)
1324 /* Return a set containing the values of the given index i
1325 * of the elements in the array tile in global memory that corresponds
1326 * to the shared memory copy.
1327 * In particular, if a is the index, we return a set with constraints
1329 * tile_offset <= a <= tile_offset + tile_size - 1
1333 * 0 <= a <= array_size - 1
1336 static __isl_give isl_set
*group_tile_dim(struct cuda_array_ref_group
*group
,
1339 isl_basic_set
*tile
;
1342 isl_local_space
*ls
;
1347 aff
= isl_aff_copy(group
->shared_bound
[i
].lb
);
1348 aff
= isl_aff_add_dims(aff
, isl_dim_set
, 1);
1349 ls
= isl_aff_get_local_space(aff
);
1350 aff
= isl_aff_neg(aff
);
1351 aff
= isl_aff_add_coefficient_si(aff
, isl_dim_set
, 0, 1);
1352 c
= isl_inequality_from_aff(isl_aff_copy(aff
));
1353 tile
= isl_basic_set_from_constraint(c
);
1355 aff
= isl_aff_neg(aff
);
1356 aff
= isl_aff_add_constant(aff
, group
->shared_bound
[i
].size
);
1357 aff
= isl_aff_add_constant_si(aff
, -1);
1358 c
= isl_inequality_from_aff(aff
);
1359 tile
= isl_basic_set_add_constraint(tile
, c
);
1361 aff
= isl_aff_zero(ls
);
1362 aff
= isl_aff_add_coefficient_si(aff
, isl_dim_set
, 0, 1);
1363 c
= isl_inequality_from_aff(aff
);
1364 tile
= isl_basic_set_add_constraint(tile
, c
);
1366 bound
= isl_pw_aff_copy(group
->array
->bound
[i
]);
1367 bound
= isl_pw_aff_add_dims(bound
, isl_dim_set
, 1);
1368 ls
= isl_local_space_from_dim(isl_pw_aff_get_dim(bound
));
1369 aff
= isl_aff_zero(ls
);
1370 aff
= isl_aff_add_coefficient_si(aff
, isl_dim_set
, 0, 1);
1371 aff
= isl_aff_add_constant_si(aff
, 1);
1372 dom
= isl_pw_aff_domain(isl_pw_aff_copy(bound
));
1374 tile_set
= isl_pw_aff_ge_set(bound
, isl_pw_aff_alloc(dom
, aff
));
1375 tile_set
= isl_set_align_params(tile_set
, isl_basic_set_get_dim(tile
));
1376 tile_set
= isl_set_intersect(tile_set
, isl_set_from_basic_set(tile
));
1381 /* Return a set containing the elements in the array tile in
1382 * global memory that corresponds to the shared memory copy.
1384 static __isl_give isl_set
*group_tile(struct cuda_array_ref_group
*group
)
1387 int n_index
= group
->array
->n_index
;
1390 tile
= group_tile_dim(group
, 0);
1391 for (i
= 1; i
< n_index
; ++i
) {
1394 tile_i
= group_tile_dim(group
, i
);
1395 tile
= isl_set_flat_product(tile
, tile_i
);
1398 tile
= isl_set_set_tuple_name(tile
, group
->array
->name
);
1403 /* Print code for reading into or writing from shared memory
1404 * the given array reference group.
1406 * sched maps the original iteration domains to the shared memory tile loops.
1408 * If we are performing a read from global memory to shared memory,
1409 * if the array involved is not a scalar and if the definition of the
1410 * shared memory tiles does not involve any strides, then we copy
1411 * the entire tile to shared memory. This may result in some extra
1412 * elements getting copied, but it should lead to simpler code
1413 * (which means that fewer registers may be needed) and less divergence.
1415 * Otherwise, we only copy the elements that will be read or have been written
1418 * Note that the absence of stride requirement can easily be lifted.
1419 * We would just need to add constraints of the form
1421 * shift + a = stride * alpha
1423 static int print_group_shared_accesses(struct cuda_gen
*gen
,
1424 struct cuda_array_ref_group
*group
, const char *type
,
1425 __isl_keep isl_set
*shared_domain
, __isl_keep isl_union_map
*sched
)
1428 isl_union_map
*access
;
1429 isl_union_set
*uset
;
1430 isl_set
*access_set
;
1432 if (group
->private_bound
)
1434 if (!group
->shared_bound
)
1437 read
= !strcmp(type
, "read");
1439 access
= group_access_relation(group
, read
, !read
);
1440 access
= isl_union_map_apply_domain(access
, isl_union_map_copy(sched
));
1441 uset
= isl_union_map_range(access
);
1443 if (isl_union_set_is_empty(uset
)) {
1444 isl_union_set_free(uset
);
1448 if (read
&& group
->array
->n_index
> 0 && no_strides(group
)) {
1449 isl_union_set_free(uset
);
1450 access_set
= group_tile(group
);
1451 print_shared_access(gen
, shared_domain
, access_set
,
1456 access_set
= isl_set_from_union_set(uset
);
1457 access_set
= isl_set_coalesce(access_set
);
1459 print_shared_access(gen
, shared_domain
, access_set
, type
, group
);
1464 /* Print code for reading into or writing from shared memory at
1465 * the given level (-1 for innermost).
1467 * If we are not printing at the innermost level, then the dimensionality
1468 * of shared_domain may be smaller than gen->shared_len.
1469 * As the rest of the code assumes that the domain of access has
1470 * gen->shared_len dimensions, we therefore may need to embed this domain
1471 * in a higher dimensional space after intersection with shared_domain.
1473 static void print_shared_accesses(struct cuda_gen
*gen
,
1474 __isl_keep isl_set
*shared_domain
, __isl_keep isl_union_map
*access
,
1475 const char *type
, int level
)
1481 int shared_len
= isl_set_dim(shared_domain
, isl_dim_set
);
1483 isl_union_map
*sched
;
1485 shared_domain
= isl_set_copy(shared_domain
);
1486 sched
= isl_union_map_copy(gen
->tiled_sched
);
1487 dim
= isl_union_map_get_dim(sched
);
1488 proj
= projection(dim
, gen
->tiled_len
, shared_len
);
1489 sched
= isl_union_map_apply_range(sched
, isl_union_map_from_map(proj
));
1490 sched
= isl_union_map_intersect_range(sched
,
1491 isl_union_set_from_set(isl_set_copy(shared_domain
)));
1492 if (shared_len
!= gen
->shared_len
) {
1493 dim
= isl_union_map_get_dim(sched
);
1494 proj
= projection(dim
, gen
->shared_len
, shared_len
);
1495 proj
= isl_map_reverse(proj
);
1496 shared_domain
= isl_set_apply(shared_domain
,
1497 isl_map_copy(proj
));
1498 sched
= isl_union_map_apply_range(sched
,
1499 isl_union_map_from_map(proj
));
1502 dim
= isl_union_map_get_dim(sched
);
1503 par
= parametrization(dim
, gen
->shared_len
, 0, gen
->shared_len
, "g");
1504 sched
= isl_union_map_intersect_range(sched
,
1505 isl_union_set_from_set(par
));
1507 for (i
= 0; i
< gen
->n_array
; ++i
) {
1508 struct cuda_array_info
*array
= &gen
->array
[i
];
1510 if (gen
->array
[i
].print_shared_level
!= level
)
1513 for (j
= 0; j
< array
->n_group
; ++j
) {
1514 if (print_group_shared_accesses(gen
, array
->groups
[j
],
1515 type
, shared_domain
, sched
))
1520 isl_union_map_free(sched
);
1521 isl_set_free(shared_domain
);
1524 print_indent(gen
->cuda
.kernel_c
, gen
->kernel_code
.indent
);
1525 fprintf(gen
->cuda
.kernel_c
, "__syncthreads();\n");
1529 /* Given an index expression into a tile of an array, adjust the expression
1530 * to a shift of the tile to the origin
1531 * (based on the lower bounds in array->shared_bound).
1532 * If the index is strided, then we first add
1533 * bound->shift and divide by bound->stride.
1535 static __isl_give isl_qpolynomial
*shift_index(__isl_take isl_qpolynomial
*qp
,
1536 struct cuda_array_info
*array
,
1537 struct cuda_array_bound
*bound
, __isl_take isl_set
*domain
)
1539 isl_qpolynomial
*lb
;
1542 isl_qpolynomial
*shift
, *t
;
1545 shift
= bound
->shift
;
1546 shift
= isl_qpolynomial_copy(shift
);
1547 shift
= isl_qpolynomial_drop_dims(shift
, isl_dim_set
, 0,
1548 isl_qpolynomial_dim(shift
, isl_dim_set
));
1549 shift
= isl_qpolynomial_align_params(shift
,
1550 isl_qpolynomial_get_dim(qp
));
1551 qp
= isl_qpolynomial_add(qp
, shift
);
1552 dim
= isl_qpolynomial_get_dim(qp
);
1554 isl_int_set_si(one
, 1);
1555 t
= isl_qpolynomial_rat_cst(dim
, one
, bound
->stride
);
1557 qp
= isl_qpolynomial_mul(qp
, t
);
1560 lb
= isl_qpolynomial_from_aff(isl_aff_copy(bound
->lb
));
1561 lb
= isl_qpolynomial_drop_dims(lb
, isl_dim_set
, 0,
1562 isl_qpolynomial_dim(lb
, isl_dim_set
));
1564 lb
= isl_qpolynomial_align_params(lb
, isl_qpolynomial_get_dim(qp
));
1566 qp
= isl_qpolynomial_sub(qp
, lb
);
1567 qp
= isl_qpolynomial_gist(qp
, domain
);
1572 /* This function is called for each access to an array in some statement
1573 * in the original code.
1574 * Replace that access by an access to shared or (linearized) global memory.
1575 * Since the array in shared memory is just
1576 * a shifted copy of part of the original array, we simply need
1577 * to subtract the lower bound, which was computed
1578 * in can_tile_for_shared_memory.
1579 * If any of the indices is strided, then we first add
1580 * shared_bound[i].shift and divide by shared_bound[i].stride.
1582 * If the given array is accessed directly from global memory,
1583 * we don't need to perform any shifting and simply simplify
1584 * expression in the context of the domain instead.
1586 * If the array space (range of access) has no name, then we are
1587 * accessing an iterator in the original program.
1589 static void print_access(struct cuda_gen
*gen
, __isl_take isl_map
*access
,
1595 struct cuda_array_info
*array
= NULL
;
1600 struct cuda_array_bound
*bounds
= NULL
;
1602 access
= isl_map_align_params(access
,
1603 isl_set_get_dim(gen
->stmt_domain
));
1605 data_set
= isl_set_apply(isl_set_copy(gen
->stmt_domain
), access
);
1607 name
= isl_set_get_tuple_name(data_set
);
1610 fprintf(gen
->cuda
.kernel_c
, "(");
1612 struct cuda_array_ref_group
*group
;
1614 for (i
= 0; i
< gen
->n_array
; ++i
) {
1615 if (strcmp(name
, gen
->array
[i
].name
))
1617 array
= &gen
->array
[i
];
1620 group
= array
->groups
[group_nr
];
1621 bounds
= group
->private_bound
;
1623 bounds
= group
->shared_bound
;
1625 print_array_name(gen
->cuda
.kernel_c
, group
);
1626 fprintf(gen
->cuda
.kernel_c
, "[");
1630 n_index
= isl_set_dim(data_set
, isl_dim_set
);
1631 aff
= isl_set_affine_hull(data_set
);
1633 prn
= isl_printer_to_file(gen
->ctx
, gen
->cuda
.kernel_c
);
1634 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
1637 for (i
= 0; i
+ 1 < n_index
; ++i
)
1638 prn
= isl_printer_print_str(prn
, "(");
1640 for (i
= 0; i
< n_index
; ++i
) {
1642 isl_qpolynomial
*qp
;
1645 ok
= isl_basic_set_has_defining_equality(aff
,
1646 isl_dim_out
, i
, &c
);
1648 qp
= isl_qpolynomial_from_constraint(c
, isl_dim_out
, i
);
1649 qp
= isl_qpolynomial_drop_dims(qp
, isl_dim_set
, 0,
1650 isl_qpolynomial_dim(qp
, isl_dim_set
));
1653 prn
= isl_printer_print_qpolynomial(prn
, qp
);
1654 isl_qpolynomial_free(qp
);
1658 domain
= isl_set_copy(gen
->stmt_domain
);
1659 domain
= isl_set_project_out(domain
, isl_dim_set
, 0,
1660 isl_set_dim(domain
, isl_dim_set
));
1662 qp
= isl_qpolynomial_gist(qp
, domain
);
1664 qp
= shift_index(qp
, array
, &bounds
[i
], domain
);
1668 prn
= isl_printer_print_str(prn
, ") * (");
1669 prn
= isl_printer_print_pw_aff(prn
,
1670 array
->local_bound
[i
]);
1671 prn
= isl_printer_print_str(prn
, ") + ");
1673 prn
= isl_printer_print_str(prn
, "][");
1675 prn
= isl_printer_print_qpolynomial(prn
, qp
);
1676 isl_qpolynomial_free(qp
);
1679 prn
= isl_printer_print_str(prn
, ")");
1681 prn
= isl_printer_print_str(prn
, "]");
1682 isl_printer_free(prn
);
1684 isl_basic_set_free(aff
);
1687 static void print_stmt_body(struct cuda_gen
*gen
,
1688 FILE *out
, struct cuda_stmt
*stmt
)
1691 struct cuda_stmt_access
*access
;
1693 for (access
= stmt
->accesses
; access
; access
= access
->next
) {
1694 fwrite(stmt
->text
+ last
, 1, access
->text_offset
- last
, out
);
1695 last
= access
->text_offset
+ access
->text_len
;
1697 print_access(gen
, isl_map_copy(access
->access
),
1701 fprintf(out
, "%s\n", stmt
->text
+ last
);
1704 /* This function is called for each leaf in the innermost clast,
1705 * i.e., for each statemetn.
1706 * We print the statement body, simplifying the accesses based
1709 static void print_statement(struct gpucode_info
*code
,
1710 struct clast_user_stmt
*u
)
1712 struct cuda_gen
*gen
= code
->user
;
1715 isl_set
*stmt_domain
;
1716 isl_union_map
*stmt_sched
;
1717 isl_union_set
*uset
;
1719 struct cuda_stmt
*stmt
;
1721 nr
= atoi(u
->statement
->name
+ 2);
1722 stmt
= &gen
->stmts
[nr
];
1724 stmt_domain
= extract_host_domain(u
);
1726 stmt_sched
= isl_union_map_intersect_range(
1727 isl_union_map_copy(gen
->local_sched
),
1728 isl_union_set_from_set(extend(stmt_domain
,
1729 gen
->thread_tiled_len
)));
1730 dim
= isl_union_map_get_dim(stmt_sched
);
1731 par
= parametrization(dim
, gen
->thread_tiled_len
, 0,
1732 gen
->thread_tiled_len
, "c");
1733 stmt_sched
= isl_union_map_intersect_range(stmt_sched
,
1734 isl_union_set_from_set(par
));
1736 uset
= isl_union_map_domain(stmt_sched
);
1737 dim
= isl_union_set_get_dim(uset
);
1738 dim
= isl_dim_add(dim
, isl_dim_set
,
1739 isl_set_dim(stmt
->domain
, isl_dim_set
));
1740 dim
= isl_dim_set_tuple_name(dim
, isl_dim_set
, u
->statement
->name
);
1741 gen
->stmt_domain
= isl_union_set_extract_set(uset
, dim
);
1742 isl_union_set_free(uset
);
1744 print_indent(code
->dst
, code
->indent
);
1745 print_stmt_body(gen
, code
->dst
, stmt
);
1747 isl_set_free(gen
->stmt_domain
);
1750 /* Print an access to the element in the global memory copy of the
1751 * given array that corresponds to element [qp[0]][qp[1]]...
1752 * of the original array.
1753 * The copy in global memory has been linearized, so we need to take
1754 * the array size into account.
1756 static void print_private_global_index(isl_ctx
*ctx
, FILE *out
,
1757 struct cuda_array_info
*array
, __isl_keep isl_qpolynomial
**qp
)
1762 fprintf(out
, "%s[", array
->name
);
1763 prn
= isl_printer_to_file(ctx
, out
);
1764 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
1765 for (i
= 0; i
+ 1 < array
->n_index
; ++i
)
1766 prn
= isl_printer_print_str(prn
, "(");
1767 for (i
= 0; i
< array
->n_index
; ++i
) {
1769 prn
= isl_printer_print_str(prn
, ") * (");
1770 prn
= isl_printer_print_pw_aff(prn
,
1771 array
->local_bound
[i
]);
1772 prn
= isl_printer_print_str(prn
, ") + ");
1774 prn
= isl_printer_print_qpolynomial(prn
, qp
[i
]);
1776 isl_printer_free(prn
);
1780 /* Print an access to the element in the shared memory copy of the
1781 * given array reference group that corresponds to element [qps[0]][qps[1]]...
1782 * of the original array.
1783 * Since the array in shared memory is just a shifted copy of part
1784 * of the original array, we simply need to subtract the lower bound,
1785 * which was computed in can_tile_for_shared_memory.
1786 * If any of the indices is strided, then we first add
1787 * shared_bound[i].shift and divide by shared_bound[i].stride.
1789 static void print_private_local_index(isl_ctx
*ctx
, FILE *out
,
1790 struct cuda_array_ref_group
*group
,
1791 __isl_keep isl_qpolynomial
**qps
, __isl_keep isl_set
*domain
)
1795 struct cuda_array_info
*array
= group
->array
;
1796 struct cuda_array_bound
*bounds
= group
->private_bound
;
1798 print_array_name(out
, group
);
1799 for (i
= 0; i
< array
->n_index
; ++i
) {
1800 isl_qpolynomial
*qp
= isl_qpolynomial_copy(qps
[i
]);
1802 qp
= shift_index(qp
, array
, &bounds
[i
], isl_set_copy(domain
));
1805 prn
= isl_printer_to_file(ctx
, out
);
1806 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
1807 prn
= isl_printer_print_qpolynomial(prn
, qp
);
1808 isl_printer_free(prn
);
1810 isl_qpolynomial_free(qp
);
1814 /* This function is called for each leaf in the clast of the code
1815 * for copying to or from private memory.
1816 * The statement name is read_private_<array> or write_private_<array>.
1818 * The schedule iterates over the array elements, so we can use
1819 * the domain of private_sched at the current scheduling position
1820 * as the index of the array.
1822 static void print_private_copy_statement(struct gpucode_info
*code
,
1823 struct clast_user_stmt
*u
)
1825 struct cuda_gen
*gen
= code
->user
;
1828 struct cuda_array_ref_group
*group
= gen
->private_group
;
1837 isl_qpolynomial
**qp
;
1840 read
= !strncmp(u
->statement
->name
, "read", 4);
1842 domain
= extract_host_domain(u
);
1845 sched
= isl_map_copy(gen
->private_sched
);
1846 sched
= isl_map_reverse(sched
);
1847 sched
= isl_map_intersect_domain(sched
, domain
);
1848 n_in
= isl_map_dim(sched
, isl_dim_in
);
1849 n_out
= isl_map_dim(sched
, isl_dim_out
);
1850 dim
= isl_map_get_dim(sched
);
1851 dim
= isl_dim_drop(dim
, isl_dim_in
, 0, n_in
);
1852 dim
= isl_dim_drop(dim
, isl_dim_out
, 0, n_out
);
1853 param
= parametrization(dim
, n_in
, 0, n_in
, "c");
1854 sched
= isl_map_align_params(sched
, isl_set_get_dim(param
));
1855 sched
= isl_map_intersect_domain(sched
, param
);
1856 index
= isl_map_range(sched
);
1857 domain
= isl_set_copy(index
);
1858 aff
= isl_set_affine_hull(index
);
1859 domain
= isl_set_project_out(domain
, isl_dim_set
, 0, n_out
);
1861 ctx
= isl_basic_set_get_ctx(aff
);
1862 qp
= isl_alloc_array(ctx
, isl_qpolynomial
*, n_out
);
1865 for (i
= 0; i
< n_out
; ++i
) {
1869 ok
= isl_basic_set_has_defining_equality(aff
,
1870 isl_dim_set
, i
, &c
);
1872 qp
[i
] = isl_qpolynomial_from_constraint(c
, isl_dim_set
, i
);
1873 qp
[i
] = isl_qpolynomial_drop_dims(qp
[i
], isl_dim_set
, 0, n_out
);
1876 print_indent(code
->dst
, code
->indent
);
1878 print_private_local_index(ctx
, code
->dst
, group
, qp
, domain
);
1879 fprintf(code
->dst
, " = ");
1880 print_private_global_index(ctx
, code
->dst
, group
->array
, qp
);
1882 print_private_global_index(ctx
, code
->dst
, group
->array
, qp
);
1883 fprintf(code
->dst
, " = ");
1884 print_private_local_index(ctx
, code
->dst
, group
, qp
, domain
);
1886 fprintf(code
->dst
, ";\n");
1888 for (i
= 0; i
< n_out
; ++i
)
1889 isl_qpolynomial_free(qp
[i
]);
1892 isl_basic_set_free(aff
);
1893 isl_set_free(domain
);
1896 static void print_private_access(struct cuda_gen
*gen
,
1897 __isl_keep isl_set
*shared_domain
, __isl_take isl_set
*access
,
1898 const char *type
, struct cuda_array_ref_group
*group
)
1900 const char *array_name
;
1903 unsigned nvar
= isl_set_dim(access
, isl_dim_set
);
1904 isl_union_map
*usched
;
1906 if (isl_set_fast_is_empty(access
)) {
1907 isl_set_free(access
);
1911 ctx
= isl_set_get_ctx(access
);
1912 array_name
= isl_set_get_tuple_name(access
);
1913 name
= isl_alloc_array(ctx
, char,
1914 strlen(type
) + sizeof("_private_") + strlen(array_name
) + 20);
1915 if (group
->array
->n_group
> 1)
1916 sprintf(name
, "%s_private_%s_%d", type
, array_name
, group
->nr
);
1918 sprintf(name
, "%s_private_%s", type
, array_name
);
1919 access
= isl_set_set_tuple_name(access
, name
);
1922 gen
->private_sched
= shift_access(access
, group
);
1923 gen
->private_group
= group
;
1925 usched
= isl_union_map_from_map(isl_map_copy(gen
->private_sched
));
1926 print_shared_body(gen
, shared_domain
, usched
, nvar
,
1927 &print_private_copy_statement
, 1);
1928 isl_union_map_free(usched
);
1930 isl_map_free(gen
->private_sched
);
1933 /* Print code for reading into or writing from private memory
1934 * the given array reference group.
1936 * sched maps the original iteration domains to the shared memory tile loops.
1938 static void print_group_private_accesses(struct cuda_gen
*gen
,
1939 struct cuda_array_ref_group
*group
,
1940 const char *type
, __isl_keep isl_set
*shared_domain
,
1941 unsigned first_shared
, int shared_len
, __isl_keep isl_union_map
*sched
)
1944 isl_union_map
*access
;
1945 isl_union_set
*uset
;
1946 isl_set
*access_set
;
1948 if (!group
->private_bound
)
1951 read
= !strcmp(type
, "read");
1953 access
= group_access_relation(group
, read
, !read
);
1954 access
= isl_union_map_apply_domain(access
, isl_union_map_copy(sched
));
1955 access
= isl_union_map_intersect(access
,
1956 isl_union_map_copy(gen
->private_access
));
1957 uset
= isl_union_map_range(access
);
1959 if (isl_union_set_is_empty(uset
)) {
1960 isl_union_set_free(uset
);
1964 access_set
= isl_set_from_union_set(uset
);
1965 access_set
= isl_set_coalesce(access_set
);
1966 access_set
= isl_set_eliminate(access_set
, isl_dim_param
,
1967 first_shared
+ shared_len
,
1968 gen
->shared_len
- shared_len
);
1970 print_private_access(gen
, shared_domain
, access_set
, type
, group
);
1973 /* Print code for reading into or writing from private memory at
1974 * the given level (-1 for innermost).
1976 * If we are not printing at the innermost level, then the dimensionality
1977 * of shared_domain may be smaller than gen->shared_len.
1978 * As the rest of the code assumes that the domain of access has
1979 * gen->shared_len dimensions, we therefore may need to embed this domain
1980 * in a higher dimensional space after intersection with shared_domain.
1982 * This code is very similar to print_shared_accesses.
1983 * The main difference is that we to take into account gen->private_access.
1985 static void print_private_accesses(struct cuda_gen
*gen
,
1986 __isl_keep isl_set
*shared_domain
, __isl_keep isl_union_map
*access
,
1987 const char *type
, int level
)
1992 int shared_len
= isl_set_dim(shared_domain
, isl_dim_set
);
1993 unsigned first_shared
;
1994 isl_union_map
*sched
;
1996 shared_domain
= isl_set_copy(shared_domain
);
1997 sched
= isl_union_map_copy(gen
->tiled_sched
);
1998 dim
= isl_union_map_get_dim(sched
);
1999 first_shared
= isl_dim_size(dim
, isl_dim_param
);
2000 proj
= projection(dim
, gen
->tiled_len
, shared_len
);
2001 sched
= isl_union_map_apply_range(sched
, isl_union_map_from_map(proj
));
2002 sched
= isl_union_map_intersect_range(sched
,
2003 isl_union_set_from_set(isl_set_copy(shared_domain
)));
2004 if (shared_len
!= gen
->shared_len
) {
2005 dim
= isl_union_map_get_dim(sched
);
2006 proj
= projection(dim
, gen
->shared_len
, shared_len
);
2007 proj
= isl_map_reverse(proj
);
2008 shared_domain
= isl_set_apply(shared_domain
,
2009 isl_map_copy(proj
));
2010 sched
= isl_union_map_apply_range(sched
,
2011 isl_union_map_from_map(proj
));
2014 for (i
= 0; i
< gen
->n_array
; ++i
) {
2015 struct cuda_array_info
*array
= &gen
->array
[i
];
2017 if (gen
->array
[i
].print_shared_level
!= level
)
2020 for (j
= 0; j
< array
->n_group
; ++j
)
2021 print_group_private_accesses(gen
, array
->groups
[j
],
2022 type
, shared_domain
,
2023 first_shared
, shared_len
, sched
);
2026 isl_union_map_free(sched
);
2027 isl_set_free(shared_domain
);
2030 /* Set unroll[j] if the input dimension j is involved in
2031 * the index expression represented by bmap.
2033 static int check_unroll(__isl_take isl_basic_map
*bmap
, void *user
)
2036 int n_in
= isl_basic_map_dim(bmap
, isl_dim_in
);
2037 int n_out
= isl_basic_map_dim(bmap
, isl_dim_out
);
2040 for (i
= 0; i
< n_out
; ++i
) {
2044 ok
= isl_basic_map_has_defining_equality(bmap
,
2045 isl_dim_out
, i
, &c
);
2047 for (j
= 0; j
< n_in
; ++j
)
2048 if (isl_constraint_involves_dims(c
, isl_dim_in
, j
, 1))
2050 isl_constraint_free(c
);
2053 isl_basic_map_free(bmap
);
2057 /* Given an array pos mapping input dimensions to the corresponding
2058 * output dimension, construct the corresponding map.
2060 static __isl_give isl_map
*permutation(__isl_take isl_dim
*dim
,
2065 isl_basic_map
*bmap
;
2067 dim
= isl_dim_add(dim
, isl_dim_in
, len
);
2068 dim
= isl_dim_add(dim
, isl_dim_out
, len
);
2069 bmap
= isl_basic_map_universe(isl_dim_copy(dim
));
2071 for (i
= 0; i
< len
; ++i
) {
2072 c
= isl_equality_alloc(isl_dim_copy(dim
));
2073 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, -1);
2074 isl_constraint_set_coefficient_si(c
, isl_dim_out
, pos
[i
], 1);
2075 bmap
= isl_basic_map_add_constraint(bmap
, c
);
2079 return isl_map_from_basic_map(bmap
);
2082 /* Find all loops involved in any of the index expressions for any of
2083 * the private accesses, move them innermost and then mark them as
2084 * requiring unrolling by setting gen->first_unroll.
2085 * The loops involved should all be parallel because of the checks
2086 * we performed in check_private_group_access. Moving them innermost
2087 * is therefore a valid transformation.
2089 static __isl_give isl_union_map
*interchange_for_unroll(struct cuda_gen
*gen
,
2090 __isl_take isl_union_map
*sched
)
2093 int unroll
[gen
->thread_tiled_len
];
2094 int perm
[gen
->thread_tiled_len
];
2097 int len
= gen
->shared_len
+ gen
->n_parallel
+ gen
->n_block
;
2099 gen
->first_unroll
= -1;
2101 for (i
= 0; i
< gen
->thread_tiled_len
; ++i
)
2103 for (i
= 0; i
< gen
->n_array
; ++i
) {
2104 struct cuda_array_info
*array
= &gen
->array
[i
];
2106 for (j
= 0; j
< array
->n_group
; ++j
) {
2107 isl_union_map
*access
;
2110 if (!array
->groups
[j
]->private_bound
)
2113 access
= group_access_relation(array
->groups
[j
], 1, 1);
2114 access
= isl_union_map_apply_domain(access
,
2115 isl_union_map_copy(sched
));
2117 acc
= isl_map_from_union_map(access
);
2118 isl_map_foreach_basic_map(acc
, &check_unroll
, unroll
);
2124 for (i
= 0; i
< gen
->shared_len
; ++i
)
2128 for (i
= gen
->shared_len
; i
< len
; ++i
)
2135 for (i
= len
; i
< gen
->thread_tiled_len
; ++i
)
2140 for (i
= 0; i
< gen
->thread_tiled_len
; ++i
)
2143 gen
->first_unroll
= 1 + j
;
2144 for (i
= 0; i
< len
; ++i
)
2148 dim
= isl_union_map_get_dim(sched
);
2149 permute
= permutation(dim
, perm
, gen
->thread_tiled_len
);
2150 sched
= isl_union_map_apply_range(sched
,
2151 isl_union_map_from_map(permute
));
2156 /* This function is called for each leaf in the clast of the kernel code.
2157 * We first specialize the schedule to the site of the leaf and
2158 * print code for reading into shared memory, performing the actual
2159 * computations and writing from shared memory, with the required
2162 static void print_kernel_user(struct gpucode_info
*code
,
2163 struct clast_user_stmt
*u
)
2165 struct cuda_gen
*gen
= code
->user
;
2166 isl_set
*shared_domain
;
2168 shared_domain
= extract_entire_host_domain(u
);
2170 print_shared_accesses(gen
, shared_domain
, gen
->read
, "read", -1);
2172 print_private_accesses(gen
, shared_domain
, gen
->read
, "read", -1);
2174 print_shared_body(gen
, shared_domain
, gen
->local_sched
,
2175 gen
->thread_tiled_len
, &print_statement
,
2178 print_private_accesses(gen
, shared_domain
, gen
->write
, "write", -1);
2180 print_indent(gen
->cuda
.kernel_c
, gen
->kernel_code
.indent
);
2181 fprintf(gen
->cuda
.kernel_c
, "__syncthreads();\n");
2183 print_shared_accesses(gen
, shared_domain
, gen
->write
, "write", -1);
2185 isl_set_free(shared_domain
);
2188 /* Check if we need to perform any copying to shared memory at this level
2189 * and if so, print the copying instructions.
2190 * Any array for which we are allowed to print copying instructions at
2191 * this level, but haven't done so already, is printed.
2193 static void print_kernel_for_head(struct gpucode_info
*code
,
2194 struct clast_for
*f
)
2197 struct cuda_gen
*gen
= code
->user
;
2202 domain
= isl_set_from_cloog_domain(cloog_domain_copy(f
->domain
));
2203 level
= isl_set_dim(domain
, isl_dim_set
) - 1;
2205 for (i
= 0; i
< gen
->n_array
; ++i
) {
2206 if (gen
->array
[i
].print_shared_level
>= 0)
2208 if (gen
->array
[i
].last_shared
> level
)
2210 gen
->array
[i
].print_shared_level
= level
;
2215 print_shared_accesses(gen
, domain
, gen
->read
, "read", level
);
2216 print_private_accesses(gen
, domain
, gen
->read
, "read", level
);
2219 isl_set_free(domain
);
2222 /* Print instructions for copying from shared memory for each array
2223 * for which print_kernel_for_head has added copying instructions
2226 static void print_kernel_for_foot(struct gpucode_info
*code
,
2227 struct clast_for
*f
)
2230 struct cuda_gen
*gen
= code
->user
;
2235 domain
= isl_set_from_cloog_domain(cloog_domain_copy(f
->domain
));
2236 level
= isl_set_dim(domain
, isl_dim_set
) - 1;
2238 for (i
= 0; i
< gen
->n_array
; ++i
) {
2239 if (gen
->array
[i
].print_shared_level
!= level
)
2246 print_private_accesses(gen
, domain
, gen
->write
, "write", level
);
2247 print_shared_accesses(gen
, domain
, gen
->write
, "write", level
);
2250 isl_set_free(domain
);
2253 /* Use CLooG to generate code for the outer gen->shared_first loops
2254 * of the local schedule "sched".
2255 * The pretty printing of this code is handled by gpu_print_host_stmt,
2256 * which calls print_kernel_user for each iteration of the shared tile loops.
2258 static void print_cloog_kernel_body(struct cuda_gen
*gen
,
2259 __isl_keep isl_set
*context
, __isl_keep isl_union_map
*sched
)
2262 CloogOptions
*options
;
2263 CloogDomain
*cloog_context
;
2264 CloogUnionDomain
*ud
;
2266 struct clast_stmt
*stmt
;
2269 sched
= isl_union_map_copy(sched
);
2270 sched
= isl_union_map_align_params(sched
, isl_set_get_dim(context
));
2272 options
= cloog_options_malloc(gen
->state
);
2273 options
->language
= LANGUAGE_C
;
2274 options
->strides
= 1;
2276 options
->stop
= gen
->shared_len
;
2277 options
->f
= gen
->tiled_len
;
2278 options
->l
= gen
->tiled_len
;
2279 options
->save_domains
= 1;
2280 options
->noscalars
= 1;
2282 ud
= cloog_union_domain_from_isl_union_map(sched
);
2283 for (i
= 0; i
< gen
->shared_len
; ++i
) {
2284 snprintf(name
, sizeof(name
), "g%d", i
);
2285 ud
= cloog_union_domain_set_name(ud
, CLOOG_SCAT
, i
, name
);
2287 cloog_context
= cloog_domain_from_isl_set(isl_set_copy(context
));
2288 input
= cloog_input_alloc(cloog_context
, ud
);
2290 stmt
= cloog_clast_create_from_input(input
, options
);
2292 gen
->kernel_code
.indent
= 4;
2293 gen
->kernel_code
.dst
= gen
->cuda
.kernel_c
;
2294 gen
->kernel_code
.print_user_stmt
= NULL
;
2295 gen
->kernel_code
.print_user_stmt_list
= &print_kernel_user
;
2296 gen
->kernel_code
.print_for_head
= &print_kernel_for_head
;
2297 gen
->kernel_code
.print_for_foot
= &print_kernel_for_foot
;
2298 gen
->kernel_code
.user
= gen
;
2299 gpu_print_host_stmt(&gen
->kernel_code
, stmt
);
2301 cloog_clast_free(stmt
);
2302 cloog_options_free(options
);
2305 static void print_kernel_iterators(struct cuda_gen
*gen
)
2308 const char *block_dims
[] = { "blockIdx.x", "blockIdx.y" };
2309 const char *thread_dims
[] = { "threadIdx.x", "threadIdx.y",
2312 if (gen
->n_grid
> 0) {
2313 print_indent(gen
->cuda
.kernel_c
, 4);
2314 fprintf(gen
->cuda
.kernel_c
, "int ");
2315 for (i
= 0; i
< gen
->n_grid
; ++i
) {
2317 fprintf(gen
->cuda
.kernel_c
, ", ");
2318 fprintf(gen
->cuda
.kernel_c
, "b%d = %s",
2319 i
, block_dims
[gen
->n_grid
- 1 - i
]);
2321 fprintf(gen
->cuda
.kernel_c
, ";\n");
2324 if (gen
->n_block
> 0) {
2325 print_indent(gen
->cuda
.kernel_c
, 4);
2326 fprintf(gen
->cuda
.kernel_c
, "int ");
2327 for (i
= 0; i
< gen
->n_block
; ++i
) {
2329 fprintf(gen
->cuda
.kernel_c
, ", ");
2330 fprintf(gen
->cuda
.kernel_c
, "t%d = %s",
2331 i
, thread_dims
[gen
->n_block
- 1 - i
]);
2333 fprintf(gen
->cuda
.kernel_c
, ";\n");
2337 static void print_group_shared_array(struct cuda_gen
*gen
,
2338 struct cuda_array_ref_group
*group
)
2341 struct cuda_array_bound
*bounds
;
2343 bounds
= group
->private_bound
;
2345 bounds
= group
->shared_bound
;
2349 print_indent(gen
->cuda
.kernel_c
, 4);
2350 fprintf(gen
->cuda
.kernel_c
, "%s%s ",
2351 group
->private_bound
? "" : "__shared__ ", gen
->options
->type
);
2352 print_array_name(gen
->cuda
.kernel_c
, group
);
2353 for (j
= 0; j
< group
->array
->n_index
; ++j
) {
2354 fprintf(gen
->cuda
.kernel_c
, "[");
2355 isl_int_print(gen
->cuda
.kernel_c
, bounds
[j
].size
, 0);
2356 fprintf(gen
->cuda
.kernel_c
, "]");
2358 fprintf(gen
->cuda
.kernel_c
, ";\n");
2361 static void print_shared_arrays(struct cuda_gen
*gen
)
2365 for (i
= 0; i
< gen
->n_array
; ++i
) {
2366 struct cuda_array_info
*array
= &gen
->array
[i
];
2368 for (j
= 0; j
< array
->n_group
; ++j
)
2369 print_group_shared_array(gen
, array
->groups
[j
]);
2373 static void print_kernel_body(struct cuda_gen
*gen
,
2374 __isl_keep isl_set
*host_domain
, __isl_keep isl_union_map
*sched
)
2378 context
= isl_set_copy(host_domain
);
2379 context
= parametrize(context
, 0, gen
->tile_first
, "h");
2380 context
= isl_set_project_out(context
, isl_dim_set
, 0, gen
->tile_first
);
2381 context
= add_bounded_parameters(context
,
2382 gen
->n_grid
, gen
->grid_dim
, "b");
2384 print_kernel_iterators(gen
);
2385 print_shared_arrays(gen
);
2387 fprintf(gen
->cuda
.kernel_c
, "\n");
2389 print_cloog_kernel_body(gen
, context
, sched
);
2391 isl_set_free(context
);
2394 /* Given a constraint
2396 * a(p,i) + j = g f(e)
2398 * or -a(p,i) - j = g f(e) if sign < 0,
2399 * store a(p,i) in bound->shift and g (stride) in bound->stride.
2400 * a(p,i) is assumed to be an expression in only the parameters.
2402 static void extract_stride(__isl_keep isl_constraint
*c
,
2403 struct cuda_array_bound
*bound
, isl_int stride
, int sign
)
2410 isl_qpolynomial
*qp
;
2412 isl_int_set(bound
->stride
, stride
);
2414 dim
= isl_constraint_get_dim(c
);
2415 dim
= isl_dim_drop(dim
, isl_dim_out
, 0, 1);
2416 dim
= isl_dim_drop(dim
, isl_dim_in
, 0, isl_dim_size(dim
, isl_dim_in
));
2417 dim
= isl_dim_domain(dim
);
2419 nparam
= isl_dim_size(dim
, isl_dim_param
);
2423 isl_int_set_si(one
, 1);
2425 isl_constraint_get_constant(c
, &v
);
2428 qp
= isl_qpolynomial_rat_cst(isl_dim_copy(dim
), v
, one
);
2430 for (i
= 0; i
< nparam
; ++i
) {
2431 isl_qpolynomial
*t
, *p
;
2433 isl_constraint_get_coefficient(c
, isl_dim_param
, i
, &v
);
2434 if (isl_int_is_zero(v
))
2438 t
= isl_qpolynomial_rat_cst(isl_dim_copy(dim
), v
, one
);
2439 p
= isl_qpolynomial_var(isl_dim_copy(dim
), isl_dim_param
, i
);
2440 t
= isl_qpolynomial_mul(t
, p
);
2441 qp
= isl_qpolynomial_add(qp
, t
);
2451 /* Given an equality constraint of a map with a single output dimension j,
2452 * check if the constraint is of the form
2454 * a(p,i) + j = g f(e)
2456 * with a(p,i) an expression in the parameters and input dimensions
2457 * and f(e) an expression in the existentially quantified variables.
2458 * If so, and if g is larger than any such g from a previously considered
2459 * constraint, then call extract_stride. to record the stride information
2462 static int check_stride_constraint(__isl_take isl_constraint
*c
, void *user
)
2467 struct cuda_array_bound
*bound
= user
;
2470 isl_int_init(stride
);
2472 n_div
= isl_constraint_dim(c
, isl_dim_div
);
2473 isl_constraint_get_coefficient(c
, isl_dim_out
, 0, &v
);
2475 if (n_div
&& (isl_int_is_one(v
) || isl_int_is_negone(v
))) {
2476 int s
= isl_int_sgn(v
);
2477 isl_int_set_si(stride
, 0);
2478 for (i
= 0; i
< n_div
; ++i
) {
2479 isl_constraint_get_coefficient(c
, isl_dim_div
, i
, &v
);
2480 isl_int_gcd(stride
, stride
, v
);
2482 if (!isl_int_is_zero(stride
) &&
2483 isl_int_gt(stride
, bound
->stride
))
2484 extract_stride(c
, bound
, stride
, s
);
2487 isl_int_clear(stride
);
2490 isl_constraint_free(c
);
2494 /* Given contraints on an array index i, check if we can find
2495 * a shift a(p) and a stride g such that
2497 * a(p) + i = 0 mod g
2499 * If so, record the information in bound and apply the mapping
2500 * i -> (i + a(p))/g to the array index in bounds and return
2501 * the new constraints.
2502 * If not, simply return the original constraints.
2504 static __isl_give isl_basic_map
*check_stride(struct cuda_gen
*gen
,
2505 struct cuda_array_bound
*bound
, __isl_take isl_basic_map
*bounds
)
2509 isl_basic_map
*shift
;
2510 isl_qpolynomial
*qp
, *t
;
2513 isl_int_set_si(bound
->stride
, -1);
2515 aff
= isl_basic_map_affine_hull(isl_basic_map_copy(bounds
));
2517 isl_basic_map_foreach_constraint(aff
, &check_stride_constraint
, bound
);
2519 isl_basic_map_free(aff
);
2521 if (isl_int_is_neg(bound
->stride
))
2524 qp
= isl_qpolynomial_copy(bound
->shift
);
2525 qp
= isl_qpolynomial_add_dims(qp
, isl_dim_set
, 1);
2526 dim
= isl_qpolynomial_get_dim(qp
);
2527 t
= isl_qpolynomial_var(isl_dim_copy(dim
), isl_dim_set
, 0);
2528 qp
= isl_qpolynomial_add(qp
, t
);
2530 isl_int_set_si(one
, 1);
2531 t
= isl_qpolynomial_rat_cst(dim
, one
, bound
->stride
);
2533 qp
= isl_qpolynomial_mul(qp
, t
);
2534 shift
= isl_basic_map_from_qpolynomial(qp
);
2536 bound
->shift_map
= isl_basic_map_copy(shift
);
2537 bounds
= isl_basic_map_apply_range(bounds
, shift
);
2542 struct cuda_size_info
{
2543 isl_basic_set
*bset
;
2544 struct cuda_array_bound
*bound
;
2548 /* Given a constraint from the basic set describing the bounds on
2549 * an array index, check if it is a lower bound, say m i >= b(x), and,
2550 * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
2551 * upper bound. If so, and if this bound is smaller than any bound
2552 * derived from earlier constraints, set the size to this bound on
2553 * the expression and the lower bound to ceil(b(x)/m).
2555 static int compute_size_in_direction(__isl_take isl_constraint
*c
, void *user
)
2557 struct cuda_size_info
*size
= user
;
2562 nparam
= isl_basic_set_dim(size
->bset
, isl_dim_param
);
2563 n_div
= isl_constraint_dim(c
, isl_dim_div
);
2565 if (isl_constraint_involves_dims(c
, isl_dim_div
, 0, n_div
)) {
2566 isl_constraint_free(c
);
2572 isl_constraint_get_coefficient(c
, isl_dim_set
, size
->pos
, &v
);
2574 if (isl_int_is_pos(v
)) {
2577 enum isl_lp_result res
;
2579 aff
= isl_constraint_get_bound(c
, isl_dim_set
, size
->pos
);
2580 aff
= isl_aff_ceil(aff
);
2582 lb
= isl_aff_copy(aff
);
2584 aff
= isl_aff_neg(aff
);
2585 aff
= isl_aff_add_coefficient_si(aff
, isl_dim_set
, size
->pos
, 1);
2587 res
= isl_basic_set_max(size
->bset
, aff
, &v
);
2590 if (res
== isl_lp_ok
) {
2591 isl_int_add_ui(v
, v
, 1);
2592 if (isl_int_is_neg(size
->bound
->size
) ||
2593 isl_int_lt(v
, size
->bound
->size
)) {
2594 isl_int_set(size
->bound
->size
, v
);
2595 lb
= isl_aff_drop_dims(lb
, isl_dim_set
,
2597 isl_aff_free(size
->bound
->lb
);
2598 size
->bound
->lb
= isl_aff_copy(lb
);
2605 isl_constraint_free(c
);
2610 /* Given a basic map "bounds" that maps parameters and input dimensions
2611 * to a single output dimension, look for an expression in the parameters
2612 * and input dimensions such that the range of the output dimension shifted
2613 * by this expression is a constant.
2615 * In particular, we currently only consider lower bounds on the output
2616 * dimension as candidate expressions.
2618 static int compute_array_dim_size(struct cuda_gen
*gen
,
2619 struct cuda_array_bound
*bound
, __isl_take isl_basic_map
*bounds
)
2621 struct cuda_size_info size
;
2623 bounds
= check_stride(gen
, bound
, bounds
);
2625 isl_int_set_si(bound
->size
, -1);
2629 size
.pos
= isl_basic_map_dim(bounds
, isl_dim_in
);
2630 size
.bset
= isl_basic_map_wrap(bounds
);
2631 size
.bset
= isl_basic_set_flatten(size
.bset
);
2632 isl_basic_set_foreach_constraint(size
.bset
, &compute_size_in_direction
,
2634 isl_basic_set_free(size
.bset
);
2636 return isl_int_is_nonneg(bound
->size
) ? 0 : -1;
2639 /* Check if we can find a shared memory tile for the given array
2640 * based on the given accesses, and if so, put the results
2641 * in array->shared_bound.
2643 * We project the accesses on each index in turn and look for a parametric
2644 * offset such that the size is constant.
2646 static int can_tile_for_shared_memory(struct cuda_gen
*gen
,
2647 struct cuda_array_info
*array
, __isl_keep isl_map
*access
,
2648 struct cuda_array_bound
*bounds
)
2652 for (i
= 0; i
< array
->n_index
; ++i
) {
2654 isl_basic_map
*hull
;
2656 access_i
= isl_map_copy(access
);
2657 access_i
= isl_map_project_out(access_i
, isl_dim_out
, 0, i
);
2658 access_i
= isl_map_project_out(access_i
, isl_dim_out
,
2659 1, array
->n_index
- (i
+ 1));
2660 access_i
= isl_map_compute_divs(access_i
);
2661 hull
= isl_map_simple_hull(access_i
);
2662 if (compute_array_dim_size(gen
, &bounds
[i
], hull
) < 0)
2669 /* Construct a map with input the shared tile loops and the loops that
2670 * will be wrapped around the threads that relates these later loops
2671 * to the thread indices and the projects them out.
2673 static __isl_give isl_map
*compute_privatization(struct cuda_gen
*gen
)
2681 dim
= isl_union_map_get_dim(gen
->shared_sched
);
2683 if (gen
->options
->wrap
)
2684 tiling
= wrap(isl_dim_copy(dim
), gen
->shared_len
+ gen
->n_block
,
2685 gen
->shared_len
, gen
->n_block
, gen
->block_dim
);
2687 tiling
= tile(isl_dim_copy(dim
), gen
->shared_len
+ gen
->n_block
,
2688 gen
->shared_len
, gen
->n_block
, gen
->block_dim
);
2692 par
= parametrization(dim
, gen
->shared_len
+ 2 * gen
->n_block
,
2693 gen
->tile_first
+ gen
->tile_len
+ gen
->n_grid
+ gen
->n_block
,
2696 priv
= isl_map_align_params(priv
, isl_set_get_dim(par
));
2697 priv
= isl_map_intersect_range(priv
, par
);
2699 dim
= isl_map_get_dim(priv
);
2700 dim
= isl_dim_drop(dim
, isl_dim_in
, 0, isl_dim_size(dim
, isl_dim_in
));
2701 dim
= isl_dim_drop(dim
, isl_dim_out
, 0, isl_dim_size(dim
, isl_dim_out
));
2702 proj
= projection(dim
, gen
->shared_len
+ 2 * gen
->n_block
,
2705 priv
= isl_map_apply_range(priv
, proj
);
2710 /* Construct a map from domain_dim to domain_dim that increments
2711 * the dimension at position "pos" and leaves all other dimensions
2714 static __isl_give isl_map
*next(__isl_take isl_dim
*domain_dim
, int pos
)
2717 int len
= isl_dim_size(domain_dim
, isl_dim_set
);
2719 isl_basic_map
*next
;
2721 dim
= isl_dim_map_from_set(domain_dim
);
2722 next
= isl_basic_map_universe(isl_dim_copy(dim
));
2724 for (i
= 0; i
< len
; ++i
) {
2727 c
= isl_equality_alloc(isl_dim_copy(dim
));
2728 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, 1);
2729 isl_constraint_set_coefficient_si(c
, isl_dim_out
, i
, -1);
2731 isl_constraint_set_constant_si(c
, 1);
2732 next
= isl_basic_map_add_constraint(next
, c
);
2737 return isl_map_from_basic_map(next
);
2740 /* Check if the given access is coalesced.
2741 * That is, check whether incrementing the dimension that will get
2742 * wrapped over the last thread index results in incrementing
2743 * the last array index.
2745 * This function is only called for access relations without reuse.
2747 static int access_is_coalesced(struct cuda_gen
*gen
,
2748 __isl_keep isl_union_map
*access
)
2751 isl_map
*access_map
;
2752 isl_map
*next_thread_x
;
2753 isl_map
*next_element
;
2757 access
= isl_union_map_copy(access
);
2758 access
= isl_union_map_apply_domain(access
,
2759 isl_union_map_copy(gen
->tiled_sched
));
2760 access_map
= isl_map_from_union_map(access
);
2762 dim
= isl_map_get_dim(access_map
);
2763 dim
= isl_dim_domain(dim
);
2764 next_thread_x
= next(dim
, gen
->shared_len
+ gen
->n_block
- 1);
2766 dim
= isl_map_get_dim(access_map
);
2767 dim
= isl_dim_range(dim
);
2768 next_element
= next(dim
, isl_dim_size(dim
, isl_dim_set
) - 1);
2770 map
= isl_map_apply_domain(next_thread_x
, isl_map_copy(access_map
));
2771 map
= isl_map_apply_range(map
, access_map
);
2773 coalesced
= isl_map_is_subset(map
, next_element
);
2775 isl_map_free(next_element
);
2781 /* For the given array reference group, check whether the access is private
2782 * to the thread. That is, check that any given array element
2783 * is only accessed by a single thread.
2784 * We compute an access relation that maps the shared tile loop iterators
2785 * and the shared point loop iterators that will be wrapped over the
2786 * threads to the array elements.
2787 * We actually check that those iterators that will be wrapped
2788 * partition the array space. This check is stricter than necessary
2789 * since several iterations may be mapped onto the same thread
2790 * and then they could be allowed to access the same memory elements,
2791 * but our check does not allow this situation.
2793 * We also check that the index expression only depends on parallel
2794 * loops. That way, we can move those loops innermost and unroll them.
2795 * Again, we use a test that is stricter than necessary.
2796 * We actually check whether the index expression only depends
2797 * on the iterators that are wrapped over the threads.
2798 * These are necessarily parallel, but there may be more parallel loops.
2800 * Combining the injectivity of the first test with the single-valuedness
2801 * of the second test, we simply test for bijectivity.
2803 * If it turns out we can use registers, we compute the private memory
2804 * tile size using can_tile_for_shared_memory, after introducing a dependence
2805 * on the thread indices.
2807 * Before performing any of the above computations, we first check
2808 * if there is any reuse on the reference group. If not, we simply
2809 * return. If, moreover, the access is coalesced then we also remove
2810 * the shared memory tiling since we should just use global memory instead.
2812 static void check_private_group_access(struct cuda_gen
*gen
,
2813 struct cuda_array_ref_group
*group
)
2816 isl_union_map
*access
;
2817 int n_index
= group
->array
->n_index
;
2819 access
= group_access_relation(group
, 1, 1);
2820 if (isl_union_map_is_injective(access
)) {
2821 if (group
->shared_bound
&& access_is_coalesced(gen
, access
)) {
2822 free_bound_list(group
->shared_bound
, n_index
);
2823 group
->shared_bound
= NULL
;
2825 isl_union_map_free(access
);
2828 access
= isl_union_map_apply_domain(access
,
2829 isl_union_map_copy(gen
->shared_sched
));
2831 acc
= isl_map_from_union_map(access
);
2833 if (!isl_map_is_bijective(acc
)) {
2838 group
->private_bound
= create_bound_list(gen
->ctx
, n_index
);
2839 acc
= isl_map_align_params(acc
, isl_map_get_dim(gen
->privatization
));
2840 acc
= isl_map_apply_domain(acc
, isl_map_copy(gen
->privatization
));
2841 if (!can_tile_for_shared_memory(gen
, group
->array
, acc
,
2842 group
->private_bound
)) {
2843 free_bound_list(group
->private_bound
, n_index
);
2844 group
->private_bound
= NULL
;
2850 /* Look for the last shared tile loop that affects the offset of the
2851 * shared or private tile and store the result in array->last_shared.
2853 static void set_last_shared(struct cuda_gen
*gen
,
2854 struct cuda_array_ref_group
*group
)
2857 struct cuda_array_bound
*bounds
;
2858 unsigned first_shared
= gen
->first_shared
;
2859 int n_index
= group
->array
->n_index
;
2861 bounds
= group
->private_bound
;
2863 bounds
= group
->shared_bound
;
2867 for (j
= gen
->shared_len
- 1; j
>= 0; --j
) {
2868 for (i
= 0; i
< n_index
; ++i
) {
2870 isl_qpolynomial
*shift
;
2873 if (isl_aff_involves_dims(lb
, isl_dim_param
,
2874 first_shared
+ j
, 1))
2877 shift
= bounds
[i
].shift
;
2880 if (isl_qpolynomial_involves_dims(shift
, isl_dim_param
,
2881 first_shared
+ j
, 1))
2887 group
->array
->last_shared
= j
;
2890 /* Compute the sizes of all private arrays for the current kernel,
2891 * as well as the offsets of the private pieces in the original arrays.
2892 * If we cannot or don't want to privatize a given array group,
2893 * we use the shared memory tile sizes computed in
2894 * compute_group_shared_bound instead.
2896 * If a given Array only has a single reference group and if we have
2897 * been able to find a privated or shared tile,
2898 * we also look for the last shared tile loop that affects the offset
2899 * (and therefore the array tile) and store the result in array->last_shared.
2901 * A privatized copy of all access relations from reference groups that
2902 * are mapped to private memory is stored in gen->privatization.
2904 static void compute_private_size(struct cuda_gen
*gen
)
2907 isl_union_map
*private;
2909 private = isl_union_map_empty(isl_union_map_get_dim(gen
->shared_sched
));
2911 for (i
= 0; i
< gen
->n_array
; ++i
) {
2912 struct cuda_array_info
*array
= &gen
->array
[i
];
2914 for (j
= 0; j
< array
->n_group
; ++j
) {
2915 check_private_group_access(gen
, array
->groups
[j
]);
2917 if (!array
->groups
[j
]->private_bound
)
2920 private = isl_union_map_union(private,
2921 group_access_relation(array
->groups
[j
], 1, 1));
2924 array
->last_shared
= gen
->shared_len
- 1;
2925 array
->print_shared_level
= -1;
2927 if (array
->n_group
!= 1)
2929 set_last_shared(gen
, array
->groups
[0]);
2932 if (isl_union_map_is_empty(private))
2933 isl_union_map_free(private);
2935 isl_union_map
*priv
;
2937 private = isl_union_map_apply_domain(private,
2938 isl_union_map_copy(gen
->shared_sched
));
2939 priv
= isl_union_map_from_map(isl_map_copy(gen
->privatization
));
2940 private = isl_union_map_apply_domain(private, priv
);
2941 gen
->private_access
= private;
2945 /* Fill up the groups array with singleton groups, i.e., one group
2946 * per reference, initializing the array, access, write and refs fields.
2947 * In particular the access field is initialized to the scheduled
2948 * access relation of the array reference.
2950 * Return the number of elements initialized, i.e., the number of
2951 * active references in the current kernel.
2953 static int populate_array_references(struct cuda_gen
*gen
,
2954 struct cuda_array_info
*array
, __isl_keep isl_union_map
*sched
,
2955 struct cuda_array_ref_group
**groups
)
2959 isl_ctx
*ctx
= isl_union_map_get_ctx(sched
);
2962 for (i
= 0; i
< array
->n_ref
; ++i
) {
2963 isl_union_map
*umap
;
2965 struct cuda_array_ref_group
*group
;
2966 struct cuda_stmt_access
*access
= array
->refs
[i
];
2968 map
= isl_map_copy(access
->access
);
2969 umap
= isl_union_map_from_map(map
);
2970 umap
= isl_union_map_apply_domain(umap
,
2971 isl_union_map_copy(sched
));
2973 if (isl_union_map_is_empty(umap
)) {
2974 isl_union_map_free(umap
);
2978 map
= isl_map_from_union_map(umap
);
2980 group
= isl_calloc_type(ctx
, struct cuda_array_ref_group
);
2982 group
->array
= array
;
2983 group
->access
= map
;
2984 group
->write
= access
->write
;
2985 group
->refs
= &array
->refs
[i
];
2987 groups
[n
++] = group
;
2993 static void free_array_ref_group(struct cuda_array_ref_group
*group
,
2998 free_bound_list(group
->shared_bound
, n_index
);
2999 free_bound_list(group
->private_bound
, n_index
);
3000 isl_map_free(group
->access
);
3005 /* If two groups have overlapping access relations and if one of them
3006 * involves a write, then merge the two groups into one.
3008 * We keep track of the grouping in "leader". leader[j] points to
3009 * an earlier group array element that belongs to the same group,
3010 * or the array element j itself if this element is the first in the group.
3012 * Return the number of group leaders.
3014 static int group_overlapping_writes(int n
,
3015 struct cuda_array_ref_group
**groups
, int *leader
)
3020 for (i
= 0; i
< n
; ++i
) {
3022 groups
[l
]->n_ref
= 1;
3023 for (j
= i
- 1; j
>= 0; --j
) {
3029 if (!groups
[l
]->write
&& !groups
[j
]->write
)
3032 map
= isl_map_intersect(isl_map_copy(groups
[l
]->access
),
3033 isl_map_copy(groups
[j
]->access
));
3034 empty
= isl_map_is_empty(map
);
3040 groups
[j
]->access
= isl_map_union(groups
[j
]->access
,
3042 groups
[j
]->write
= 1;
3043 groups
[l
]->access
= NULL
;
3044 groups
[j
]->n_ref
+= groups
[l
]->n_ref
;
3054 /* Compute the size of the shared array corresponding to the given array
3055 * array refrence group, based on the accesses from the current kernel,
3056 * as well as the offset of the shared piece in the original array.
3058 static void compute_group_shared_bound(struct cuda_gen
*gen
,
3059 struct cuda_array_info
*array
, struct cuda_array_ref_group
*group
)
3061 isl_ctx
*ctx
= isl_dim_get_ctx(array
->dim
);
3063 group
->shared_bound
= create_bound_list(ctx
, array
->n_index
);
3064 if (!can_tile_for_shared_memory(gen
, array
, group
->access
,
3065 group
->shared_bound
)) {
3066 free_bound_list(group
->shared_bound
, array
->n_index
);
3067 group
->shared_bound
= NULL
;
3071 /* Given an initial grouping of array references and shared memory tiles
3072 * for each group that allows for a shared memory tile, merge two groups
3073 * if both have a shared memory tile and if the merged group also has
3074 * a shared memory tile.
3076 * Return the number of group leaders after merging.
3078 static int group_common_shared_memory_tile(struct cuda_gen
*gen
,
3079 struct cuda_array_info
*array
, int n
,
3080 struct cuda_array_ref_group
**groups
, int *leader
, int n_group
)
3083 isl_ctx
*ctx
= isl_dim_get_ctx(array
->dim
);
3085 for (i
= 0; n_group
> 1 && i
< n
; ++i
) {
3089 if (!groups
[i
]->shared_bound
)
3091 for (j
= i
- 1; j
>= 0; --j
) {
3094 struct cuda_array_bound
*shared_bound
;
3098 if (!groups
[j
]->shared_bound
)
3101 map
= isl_map_intersect(isl_map_copy(groups
[l
]->access
),
3102 isl_map_copy(groups
[j
]->access
));
3103 empty
= isl_map_is_empty(map
);
3109 map
= isl_map_union(isl_map_copy(groups
[l
]->access
),
3110 isl_map_copy(groups
[j
]->access
));
3111 shared_bound
= create_bound_list(ctx
, array
->n_index
);
3112 if (!can_tile_for_shared_memory(gen
, array
, map
,
3115 free_bound_list(shared_bound
, array
->n_index
);
3119 free_bound_list(groups
[j
]->shared_bound
,
3121 groups
[j
]->shared_bound
= shared_bound
;
3122 isl_map_free(groups
[j
]->access
);
3123 groups
[j
]->access
= map
;
3124 groups
[j
]->n_ref
+= groups
[l
]->n_ref
;
3133 /* Extract an array of array reference groups from the array of references
3134 * and the grouping information in "leader".
3136 * Store the results in array->n_group and array->groups.
3138 static void extract_array_groups(isl_ctx
*ctx
, struct cuda_array_info
*array
,
3139 int n
, struct cuda_array_ref_group
**groups
, int *leader
, int n_group
)
3143 for (i
= 2; i
< n
; ++i
)
3144 leader
[i
] = leader
[leader
[i
]];
3146 array
->n_group
= n_group
;
3147 array
->groups
= isl_alloc_array(ctx
, struct cuda_array_ref_group
*,
3149 assert(array
->groups
);
3152 for (i
= 0; i
< n
; ++i
) {
3154 struct cuda_stmt_access
**refs
;
3156 if (leader
[i
] != i
) {
3157 groups
[i
]->refs
= NULL
;
3158 free_array_ref_group(groups
[i
], array
->n_index
);
3162 refs
= isl_alloc_array(ctx
, struct cuda_stmt_access
*,
3166 for (k
= i
; k
< n
; ++k
)
3167 if (leader
[k
] == i
) {
3168 refs
[l
++] = *groups
[k
]->refs
;
3169 (*groups
[k
]->refs
)->group
= j
;
3172 groups
[i
]->refs
= refs
;
3174 array
->groups
[j
++] = groups
[i
];
3178 /* Group array references that should be considered together when
3179 * deciding whether to access them from private, shared or global memory.
3181 * In particular, if two array references overlap and if one of them
3182 * is a write, then the two references are grouped together.
3183 * Furthermore, if two groups admit a shared memory tile and if the
3184 * combination of the two also admits a shared memory tile, we merge
3187 * During the construction the group->refs field points to a single
3188 * array reference inside the array of array references, while
3189 * group->n_ref contains the number of element in leader that
3190 * (directly or indirectly) point to this group, provided the group
3193 static void group_array_references(struct cuda_gen
*gen
,
3194 struct cuda_array_info
*array
, __isl_keep isl_union_map
*sched
)
3198 isl_ctx
*ctx
= isl_union_map_get_ctx(sched
);
3199 struct cuda_array_ref_group
**groups
;
3202 groups
= isl_calloc_array(ctx
, struct cuda_array_ref_group
*,
3206 n
= populate_array_references(gen
, array
, sched
, groups
);
3208 leader
= isl_alloc_array(ctx
, int, n
);
3211 n_group
= group_overlapping_writes(n
, groups
, leader
);
3213 for (i
= 0; i
< n
; ++i
)
3215 compute_group_shared_bound(gen
, array
, groups
[i
]);
3217 n_group
= group_common_shared_memory_tile(gen
, array
, n
, groups
,
3220 extract_array_groups(ctx
, array
, n
, groups
, leader
, n_group
);
3226 /* Take tiled_sched, project it onto the shared tile loops and
3227 * the loops that will be wrapped over the threads,
3228 * parametrize the shared tile loops and store the result in gen->shared_sched.
3229 * The position of the first of these parameters is stored in gen->first_shared.
3230 * Also compute a projection that projects out the loops that will be
3231 * wrapped over the threads and store this projection in gen->shared_proj.
3233 static void compute_shared_sched(struct cuda_gen
*gen
)
3238 isl_union_map
*sched
;
3240 sched
= isl_union_map_copy(gen
->tiled_sched
);
3242 dim
= isl_union_map_get_dim(sched
);
3243 gen
->first_shared
= isl_dim_size(dim
, isl_dim_param
);
3244 proj
= projection(dim
, gen
->tiled_len
, gen
->shared_len
+ gen
->n_block
);
3245 sched
= isl_union_map_apply_range(sched
, isl_union_map_from_map(proj
));
3247 dim
= isl_union_map_get_dim(sched
);
3248 par
= parametrization(dim
, gen
->shared_len
+ gen
->n_block
,
3249 0, gen
->shared_len
, "g");
3250 sched
= isl_union_map_intersect_range(sched
,
3251 isl_union_set_from_set(par
));
3253 dim
= isl_union_map_get_dim(sched
);
3254 proj
= projection(dim
, gen
->shared_len
+ gen
->n_block
, gen
->shared_len
);
3256 gen
->shared_sched
= sched
;
3257 gen
->shared_proj
= isl_union_map_from_map(proj
);
3260 /* Group references of all arrays in the program.
3262 static void group_references(struct cuda_gen
*gen
)
3265 isl_union_map
*sched
;
3267 sched
= isl_union_map_apply_range(isl_union_map_copy(gen
->shared_sched
),
3268 isl_union_map_copy(gen
->shared_proj
));
3270 for (i
= 0; i
< gen
->n_array
; ++i
)
3271 group_array_references(gen
, &gen
->array
[i
], sched
);
3273 isl_union_map_free(sched
);
3276 /* Free all array information that is local to the current kernel.
3278 static void free_local_array_info(struct cuda_gen
*gen
)
3282 for (i
= 0; i
< gen
->n_array
; ++i
) {
3283 struct cuda_array_info
*array
= &gen
->array
[i
];
3285 for (j
= 0; j
< array
->n_group
; ++j
)
3286 free_array_ref_group(array
->groups
[j
], array
->n_index
);
3287 free(array
->groups
);
3289 if (array
->n_group
== 0)
3291 for (j
= 0; j
< gen
->array
[i
].n_index
; ++j
) {
3292 isl_pw_aff_free(gen
->array
[i
].local_bound
[j
]);
3293 gen
->array
[i
].local_bound
[j
] = NULL
;
/* Print a parenthesized, comma separated list of iterators
 * prefix0, prefix1, ..., wrapping each iterator in its own
 * parentheses if "parens" is set.
 * NOTE(review): the fourth parameter name and the separator/outer
 * parentheses were dropped by the extraction and have been
 * reconstructed — confirm against upstream.
 */
static void print_iterator_list(FILE *out, int len, const char *prefix,
	int parens)
{
	int i;

	fprintf(out, "(");
	for (i = 0; i < len; ++i) {
		if (i)
			fprintf(out, ", ");
		if (parens)
			fprintf(out, "(%s%d)", prefix, i);
		else
			fprintf(out, "%s%d", prefix, i);
	}
	fprintf(out, ")");
}
3315 /* Print an access to the element in the global memory copy of the
3316 * given array that corresponds to element [a0][a1]... of the original array.
3317 * The copy in global memory has been linearized, so we need to take
3318 * the array size into account.
3320 static void print_global_index(isl_ctx
*ctx
, FILE *out
,
3321 struct cuda_array_info
*array
)
3326 fprintf(out
, "%s[", array
->name
);
3327 for (i
= 0; i
+ 1 < array
->n_index
; ++i
)
3329 for (i
= 0; i
< array
->n_index
; ++i
) {
3331 prn
= isl_printer_to_file(ctx
, out
);
3332 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
3333 prn
= isl_printer_print_str(prn
, ") * (");
3334 prn
= isl_printer_print_pw_aff(prn
,
3335 array
->local_bound
[i
]);
3336 prn
= isl_printer_print_str(prn
, ") + ");
3337 isl_printer_free(prn
);
3339 fprintf(out
, "a%d", i
);
3344 /* Print an access to the element in the shared memory copy of the
3345 * given array that corresponds to element [a0][a1]... of the original array.
3346 * Since the array in shared memory is just a shifted copy of part
3347 * of the original array, we simply need to subtract the lower bound,
3348 * which was computed in can_tile_for_shared_memory.
3349 * If any of the indices is strided, then we first add
3350 * shared_bound[i].shift and divide by shared_bound[i].stride.
3352 static void print_local_index(FILE *out
, struct cuda_array_ref_group
*group
)
3357 struct cuda_array_bound
*bounds
= group
->shared_bound
;
3359 ctx
= isl_dim_get_ctx(group
->array
->dim
);
3360 print_array_name(out
, group
);
3361 for (i
= 0; i
< group
->array
->n_index
; ++i
) {
3362 fprintf(out
, "[(a%d", i
);
3363 if (bounds
[i
].shift
) {
3364 fprintf(out
, " + (");
3365 prn
= isl_printer_to_file(ctx
, out
);
3366 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
3367 prn
= isl_printer_print_qpolynomial(prn
,
3369 prn
= isl_printer_print_str(prn
, "))/");
3370 prn
= isl_printer_print_isl_int(prn
,
3372 isl_printer_free(prn
);
3375 fprintf(out
, " - (");
3376 prn
= isl_printer_to_file(ctx
, out
);
3377 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
3378 prn
= isl_printer_print_aff(prn
, bounds
[i
].lb
);
3379 isl_printer_free(prn
);
3384 /* Print '#define's for copying data from global memory to shared
3385 * memory and back for the given array.
3387 static void print_array_copy_defines(struct cuda_gen
*gen
,
3388 struct cuda_array_ref_group
*group
)
3391 const char *type
[] = { "read", "write" };
3392 struct cuda_array_info
*array
= group
->array
;
3393 int n_index
= array
->n_index
;
3395 for (i
= 0; i
< 2; ++i
) {
3396 fprintf(gen
->cuda
.kernel_c
, "#define %s_", type
[i
]);
3397 print_array_name(gen
->cuda
.kernel_c
, group
);
3398 print_iterator_list(gen
->cuda
.kernel_c
, n_index
, "a", 0);
3399 fprintf(gen
->cuda
.kernel_c
, " %s_", type
[i
]);
3400 print_array_name(gen
->cuda
.kernel_c
, group
);
3401 fprintf(gen
->cuda
.kernel_c
, "_");
3402 print_iterator_list(gen
->cuda
.kernel_c
, n_index
, "a", 1);
3403 fprintf(gen
->cuda
.kernel_c
, "\n");
3405 fprintf(gen
->cuda
.kernel_c
, "#define %s_", type
[i
]);
3406 print_array_name(gen
->cuda
.kernel_c
, group
);
3407 fprintf(gen
->cuda
.kernel_c
, "_");
3408 print_iterator_list(gen
->cuda
.kernel_c
, n_index
, "a", 0);
3410 fprintf(gen
->cuda
.kernel_c
, " ");
3411 print_global_index(gen
->ctx
, gen
->cuda
.kernel_c
, array
);
3412 fprintf(gen
->cuda
.kernel_c
, " = ");
3413 print_local_index(gen
->cuda
.kernel_c
, group
);
3415 fprintf(gen
->cuda
.kernel_c
, " ");
3416 print_local_index(gen
->cuda
.kernel_c
, group
);
3417 fprintf(gen
->cuda
.kernel_c
, " = ");
3418 print_global_index(gen
->ctx
, gen
->cuda
.kernel_c
, array
);
3420 fprintf(gen
->cuda
.kernel_c
, "\n");
3424 static void print_copy_defines(struct cuda_gen
*gen
)
3428 for (i
= 0; i
< gen
->n_array
; ++i
) {
3429 struct cuda_array_info
*array
= &gen
->array
[i
];
3431 for (j
= 0; j
< array
->n_group
; ++j
) {
3432 if (array
->groups
[j
]->private_bound
)
3434 if (!array
->groups
[j
]->shared_bound
)
3436 print_array_copy_defines(gen
, array
->groups
[j
]);
3441 /* The sizes of the arrays on the host that have been computed by
3442 * extract_array_info may depend on the parameters. Use the extra
3443 * constraints on the parameters that are valid at "host_domain"
3444 * to simplify these expressions.
3446 static void localize_bounds(struct cuda_gen
*gen
,
3447 __isl_keep isl_set
*host_domain
)
3453 context
= isl_set_copy(host_domain
);
3454 nvar
= isl_set_dim(host_domain
, isl_dim_set
);
3455 context
= isl_set_project_out(host_domain
, isl_dim_set
, 0, nvar
);
3457 for (i
= 0; i
< gen
->n_array
; ++i
) {
3458 struct cuda_array_info
*array
= &gen
->array
[i
];
3460 if (array
->n_group
== 0)
3463 for (j
= 0; j
< array
->n_index
; ++j
) {
3466 pwaff
= isl_pw_aff_copy(array
->bound
[j
]);
3467 pwaff
= isl_pw_aff_gist(pwaff
, isl_set_copy(context
));
3468 array
->local_bound
[j
] = pwaff
;
3471 isl_set_free(context
);
3474 /* Set gen->tile_len and gen->n_parallel to those of the first statement
3475 * in the statement list u.
3476 * Because of the way the schedule is constructed, the other statements
3477 * in the list, if any, should have the same values for these properties.
3479 static void set_tile_len(struct cuda_gen
*gen
, struct clast_user_stmt
*u
)
3482 struct cuda_stmt
*stmt
;
3484 nr
= atoi(u
->statement
->name
+ 2);
3485 stmt
= &gen
->stmts
[nr
];
3487 gen
->tile_len
= stmt
->tile_len
;
3488 gen
->n_parallel
= stmt
->n_parallel
;
3491 /* This function is called for each leaf in the clast of the host code.
3492 * We first specialize the schedule to the site of the leaf, compute
3493 * the size of shared memory and then print the body of host code
3494 * and the associated kernel (through a call to print_kernel_body).
3496 static void print_host_user(struct gpucode_info
*code
,
3497 struct clast_user_stmt
*u
)
3499 struct cuda_gen
*gen
= code
->user
;
3502 isl_set
*host_domain
;
3503 isl_union_map
*access
;
3504 isl_union_map
*local_sched
;
3505 isl_union_set
*arrays
;
3507 set_tile_len(gen
, u
);
3510 host_domain
= extract_entire_host_domain(u
);
3512 local_sched
= isl_union_map_intersect_range(
3513 isl_union_map_copy(gen
->sched
),
3514 isl_union_set_from_set(extend(isl_set_copy(host_domain
),
3515 gen
->untiled_len
)));
3516 access
= isl_union_map_union(isl_union_map_copy(gen
->read
),
3517 isl_union_map_copy(gen
->write
));
3518 access
= isl_union_map_apply_domain(access
,
3519 isl_union_map_copy(local_sched
));
3520 arrays
= isl_union_map_range(access
);
3522 print_indent(code
->dst
, code
->indent
);
3523 fprintf(code
->dst
, "dim3 k%d_dimBlock(", gen
->kernel_id
);
3524 print_reverse_list(code
->dst
, gen
->n_block
, gen
->block_dim
);
3525 fprintf(code
->dst
, ");\n");
3527 print_indent(code
->dst
, code
->indent
);
3528 fprintf(code
->dst
, "dim3 k%d_dimGrid(", gen
->kernel_id
);
3529 print_reverse_list(code
->dst
, gen
->n_grid
, gen
->grid_dim
);
3530 fprintf(code
->dst
, ");\n");
3532 gen
->tiled_sched
= tile_schedule(gen
, local_sched
);
3533 gen
->tiled_sched
= parametrize_tiled_schedule(gen
, gen
->tiled_sched
);
3534 gen
->tiled_sched
= scale_tile_loops(gen
, gen
->tiled_sched
);
3536 gen
->local_sched
= isl_union_map_copy(gen
->tiled_sched
);
3538 dim
= isl_union_map_get_dim(gen
->local_sched
);
3539 par
= parametrization(dim
, gen
->tiled_len
, 0, gen
->shared_len
, "g");
3540 gen
->local_sched
= isl_union_map_intersect_range(gen
->local_sched
,
3541 isl_union_set_from_set(par
));
3543 gen
->local_sched
= thread_tile_schedule(gen
, gen
->local_sched
);
3544 gen
->local_sched
= scale_thread_tile_loops(gen
, gen
->local_sched
);
3546 gen
->private_access
= NULL
;
3547 compute_shared_sched(gen
);
3548 gen
->privatization
= compute_privatization(gen
);
3549 group_references(gen
);
3550 compute_private_size(gen
);
3551 localize_bounds(gen
, host_domain
);
3553 gen
->local_sched
= interchange_for_unroll(gen
, gen
->local_sched
);
3555 print_copy_defines(gen
);
3556 print_kernel_launch(gen
, arrays
);
3558 fprintf(gen
->cuda
.kernel_c
, "{\n");
3560 print_kernel_body(gen
, host_domain
, gen
->tiled_sched
);
3562 fprintf(gen
->cuda
.kernel_c
, "}\n");
3564 free_local_array_info(gen
);
3565 isl_map_free(gen
->privatization
);
3566 isl_union_map_free(gen
->private_access
);
3567 isl_union_map_free(gen
->local_sched
);
3568 isl_union_map_free(gen
->tiled_sched
);
3569 isl_union_map_free(gen
->shared_sched
);
3570 isl_union_map_free(gen
->shared_proj
);
3571 isl_union_set_free(arrays
);
3572 isl_set_free(host_domain
);
3574 free(gen
->tile_size
);
3578 /* Use CLooG to generate code for the outer gen->tile_first loops
3579 * of the global schedule in gen->sched.
3580 * The pretty printing of this code is handled by gpu_print_host_stmt,
3581 * which calls print_host_user for each kernel invocation location.
3583 static void print_cloog_host_code(struct cuda_gen
*gen
)
3587 isl_union_map
*sched
;
3588 CloogOptions
*options
;
3589 CloogDomain
*cloog_context
;
3590 CloogUnionDomain
*ud
;
3592 struct clast_stmt
*stmt
;
3595 options
= cloog_options_malloc(gen
->state
);
3596 options
->language
= LANGUAGE_C
;
3598 options
->strides
= 1;
3599 options
->stop
= gen
->tile_first
;
3600 options
->f
= gen
->untiled_len
;
3601 options
->l
= gen
->untiled_len
;
3602 options
->save_domains
= 1;
3603 options
->noscalars
= 1;
3605 sched
= isl_union_map_copy(gen
->sched
);
3606 ud
= cloog_union_domain_from_isl_union_map(sched
);
3607 for (i
= 0; i
< options
->stop
; ++i
) {
3608 snprintf(name
, sizeof(name
), "h%d", i
);
3609 ud
= cloog_union_domain_set_name(ud
, CLOOG_SCAT
, i
, name
);
3611 context
= isl_set_copy(gen
->context
);
3612 cloog_context
= cloog_domain_from_isl_set(context
);
3613 input
= cloog_input_alloc(cloog_context
, ud
);
3615 stmt
= cloog_clast_create_from_input(input
, options
);
3617 gen
->code
.indent
= 0;
3618 gen
->code
.dst
= gen
->cuda
.host_c
;
3619 gen
->code
.print_user_stmt
= NULL
;
3620 gen
->code
.print_user_stmt_list
= &print_host_user
;
3621 gen
->code
.print_for_head
= NULL
;
3622 gen
->code
.print_for_foot
= NULL
;
3623 gen
->code
.user
= gen
;
3624 gpu_print_host_stmt(&gen
->code
, stmt
);
3626 cloog_clast_free(stmt
);
3627 cloog_options_free(options
);
3630 void print_host_code(struct cuda_gen
*gen
)
3632 fprintf(gen
->cuda
.host_c
, "{\n");
3633 print_cloog_macros(gen
->cuda
.host_c
);
3634 print_cloog_macros(gen
->cuda
.kernel_c
);
3636 declare_device_arrays(gen
);
3638 allocate_device_arrays(gen
);
3639 copy_arrays_to_device(gen
);
3642 print_cloog_host_code(gen
);
3644 copy_arrays_from_device(gen
);
3645 free_device_arrays(gen
);
3647 fprintf(gen
->cuda
.host_c
, "}\n");
3650 __isl_give isl_set
*add_context_from_str(__isl_take isl_set
*set
,
3659 ctx
= isl_set_get_ctx(set
);
3660 context
= isl_set_read_from_str(ctx
, str
, -1);
3661 context
= isl_set_align_params(context
, isl_set_get_dim(set
));
3662 set
= isl_set_intersect(set
, context
);
3667 /* Convert scop->context to an isl_set.
3669 static __isl_give isl_set
*extract_context(isl_ctx
*ctx
, scoplib_scop_p scop
)
3673 dim
= isl_dim_set_alloc(ctx
, scop
->nb_parameters
, 0);
3674 dim
= set_dim_names(dim
, isl_dim_param
, scop
->parameters
);
3675 return scoplib_matrix_to_isl_set(scop
->context
, dim
);
3678 /* Return an array of cuda_stmt representing the statements in "scop".
3680 static struct cuda_stmt
*extract_stmts(isl_ctx
*ctx
, scoplib_scop_p scop
,
3681 __isl_keep isl_set
*context
)
3684 struct cuda_stmt
*stmts
;
3685 scoplib_statement_p stmt
= scop
->statement
;
3687 n
= scoplib_statement_number(scop
->statement
);
3688 stmts
= isl_calloc_array(ctx
, struct cuda_stmt
, n
);
3691 for (stmt
= scop
->statement
, n
= 0; stmt
; stmt
= stmt
->next
, n
++) {
3694 struct cuda_stmt
*s
= &stmts
[n
];
3696 snprintf(name
, sizeof(name
), "S_%d", n
);
3698 dim
= isl_dim_set_alloc(ctx
, scop
->nb_parameters
,
3699 stmt
->nb_iterators
);
3700 dim
= set_dim_names(dim
, isl_dim_param
, scop
->parameters
);
3701 dim
= set_dim_names(dim
, isl_dim_set
, stmt
->iterators
);
3702 dim
= isl_dim_set_tuple_name(dim
, isl_dim_set
, name
);
3703 dim
= set_dim_names(dim
, isl_dim_set
, stmt
->iterators
);
3704 s
->domain
= scoplib_matrix_list_to_isl_set(stmt
->domain
,
3706 s
->domain
= isl_set_intersect(s
->domain
, isl_set_copy(context
));
3707 s
->text
= strdup(stmt
->body
);
3708 stmt_extract_accesses(s
);
3714 /* Extract all the read and write accesses from "scop" and store
3715 * them in gen->read and gen->write.
3717 static void extract_accesses(struct cuda_gen
*gen
, scoplib_scop_p scop
)
3720 int n
= scoplib_statement_number(scop
->statement
);
3722 scoplib_statement_p stmt
;
3724 dim
= isl_set_get_dim(gen
->context
);
3725 gen
->write
= isl_union_map_empty(isl_dim_copy(dim
));
3726 gen
->read
= isl_union_map_empty(dim
);
3728 for (i
= 0, stmt
= scop
->statement
; i
< n
; ++i
, stmt
= stmt
->next
) {
3729 isl_union_map
*read_i
;
3730 isl_union_map
*write_i
;
3732 read_i
= scoplib_access_to_isl_union_map(stmt
->read
,
3733 isl_set_copy(gen
->stmts
[i
].domain
),
3735 write_i
= scoplib_access_to_isl_union_map(stmt
->write
,
3736 isl_set_copy(gen
->stmts
[i
].domain
),
3739 gen
->read
= isl_union_map_union(gen
->read
, read_i
);
3740 gen
->write
= isl_union_map_union(gen
->write
, write_i
);
3744 /* Extract and return the original schedule of the program from "scop".
3746 static isl_union_map
*extract_original_schedule(struct cuda_gen
*gen
,
3747 scoplib_scop_p scop
)
3750 int n
= scoplib_statement_number(scop
->statement
);
3752 isl_union_map
*sched
;
3753 scoplib_statement_p stmt
;
3755 dim
= isl_set_get_dim(gen
->context
);
3756 sched
= isl_union_map_empty(dim
);
3758 for (i
= 0, stmt
= scop
->statement
; i
< n
; ++i
, stmt
= stmt
->next
) {
3761 dim
= isl_set_get_dim(gen
->stmts
[i
].domain
);
3762 dim
= isl_dim_from_domain(dim
);
3763 dim
= isl_dim_add(dim
, isl_dim_out
, 2 * stmt
->nb_iterators
+ 1);
3764 sched_i
= scoplib_schedule_to_isl_map(stmt
->schedule
, dim
);
3766 sched
= isl_union_map_union(sched
,
3767 isl_union_map_from_map(sched_i
));
3773 /* Return the union of all iteration domains of the gen->stmts[i].
3775 static __isl_give isl_union_set
*extract_domain(struct cuda_gen
*gen
)
3778 isl_union_set
*domain
;
3780 domain
= isl_union_set_empty(isl_set_get_dim(gen
->context
));
3781 for (i
= 0; i
< gen
->n_stmts
; ++i
) {
3784 domain_i
= isl_set_copy(gen
->stmts
[i
].domain
);
3785 domain
= isl_union_set_union(domain
,
3786 isl_union_set_from_set(domain_i
));
3792 /* Information about the outermost tilable bands in the forest of bands.
3794 * tile_len and n_parallel are only sets on band_info structures
3795 * that correspond to outermost bands. For other bands (in particular,
3796 * ancestors of the outermost bands), n_parallal is set to 0.
3798 * prefix is the (padded) schedule leading up to the outermost tilable bands.
3800 * tile_first is the number of schedule dimensions in prefix.
3802 * suffix is the schedule of the outermost tilable bands and their descendants.
3805 struct cuda_gen
*gen
;
3809 isl_union_map
*prefix
;
3810 isl_union_map
*suffix
;
3813 /* Set tile_len and n_parallel of the statement to that of
3814 * their outermost band, recorded in the band_info.
3816 static int set_stmt_tile_len(__isl_take isl_map
*map
, void *user
)
3818 struct band_info
*info
= user
;
3820 struct cuda_stmt
*stmt
;
3822 nr
= atoi(isl_map_get_tuple_name(map
, isl_dim_in
) + 2);
3823 stmt
= &info
->gen
->stmts
[nr
];
3825 stmt
->tile_len
= info
->tile_len
;
3826 stmt
->n_parallel
= info
->n_parallel
;
3833 static void list_select_outer_band(struct cuda_gen
*gen
,
3834 __isl_take isl_band_list
*list
, int pos
, struct band_info
*list_info
);
3836 /* Check if this band has any parallel loops. If so, take it as
3837 * the outermost tilable band. If not, continue looking for the
3838 * outermost tilable band in the children of the current band.
3840 static void band_select_outer_band(struct cuda_gen
*gen
,
3841 __isl_take isl_band
*band
, int pos
, struct band_info
*info
)
3843 int n
= isl_band_n_member(band
);
3846 for (n_parallel
= 0; n_parallel
< n
; ++n_parallel
)
3847 if (!isl_band_member_is_zero_distance(band
, n_parallel
))
3850 info
->n_parallel
= n_parallel
;
3853 info
->tile_first
= pos
;
3855 info
->prefix
= isl_band_get_prefix_schedule(band
);
3856 info
->suffix
= isl_union_map_flat_range_product(
3857 isl_band_get_partial_schedule(band
),
3858 isl_band_get_suffix_schedule(band
));
3859 isl_union_map_foreach_map(info
->prefix
,
3860 &set_stmt_tile_len
, info
);
3862 isl_band_list
*children
;
3863 assert(isl_band_has_children(band
));
3864 children
= isl_band_get_children(band
);
3865 list_select_outer_band(gen
, children
, pos
+ n
, info
);
3868 isl_band_free(band
);
3871 /* Comparison function that returns a non-zero value for band_infos
3872 * with different tile_len fields or different n_parallel fields.
3874 static int cmp_band(const void *p1
, const void *p2
)
3876 const struct band_info
*info1
= p1
;
3877 const struct band_info
*info2
= p2
;
3879 if (info1
->tile_len
!= info2
->tile_len
)
3880 return info1
->tile_len
- info2
->tile_len
;
3882 return info1
->n_parallel
- info2
->n_parallel
;
3885 /* Extend "umap" with coordinates with fixed value "val"
3886 * to a total length of "dst_len", assuming the original dimension is "src_len".
3888 static __isl_give isl_union_map
*extend_range(__isl_take isl_union_map
*umap
,
3889 int src_len
, int dst_len
, int val
)
3895 dim
= isl_union_map_get_dim(umap
);
3896 map
= isl_map_reverse(projection(dim
, dst_len
, src_len
));
3897 for (i
= src_len
; i
< dst_len
; ++i
)
3898 map
= isl_map_fix_si(map
, isl_dim_out
, i
, val
);
3900 umap
= isl_union_map_apply_range(umap
, isl_union_map_from_map(map
));
3905 /* Group bands with the same values for tile_len and n_parallel.
3906 * The prefix schedule is then extended with a fixed coordinate that
3907 * is different for each such group.
3908 * Note that the actual values for this coordinate are not important.
3909 * The bands have already been effectively separated at a higher level
3910 * or they are independent and may be executed in parallel.
3911 * The list of band_info has been sorted before this functions is called.
3913 static void separate_bands(struct band_info
*info
, int n
)
3918 for (i
= 0; i
< n
; ++i
) {
3919 int l
= info
[i
].tile_first
;
3922 (info
[i
].tile_len
!= info
[i
- 1].tile_len
||
3923 info
[i
].n_parallel
!= info
[i
- 1].n_parallel
))
3926 info
[i
].prefix
= extend_range(info
[i
].prefix
,
3928 info
[i
].tile_first
= l
+ 1;
3932 /* Select the outermost bands in the elements of the list, align
3933 * their prefix schedules, separate bands with different values
3934 * for tile_len and/or n_parallel and then combine the resulting
3935 * prefix and suffix schedules into a single pair of prefix and
3936 * suffix schedules for the entire list.
3938 static void list_select_outer_band(struct cuda_gen
*gen
,
3939 __isl_take isl_band_list
*list
, int pos
, struct band_info
*list_info
)
3943 int n
= isl_band_list_n_band(list
);
3944 isl_ctx
*ctx
= isl_band_list_get_ctx(list
);
3945 struct band_info
*info
;
3947 isl_union_map
*prefix
;
3948 isl_union_map
*suffix
;
3951 info
= isl_calloc_array(ctx
, struct band_info
, n
);
3955 for (i
= 0; i
< n
; ++i
) {
3956 band
= isl_band_list_get_band(list
, i
);
3957 band_select_outer_band(gen
, band
, pos
, &info
[i
]);
3958 if (info
[i
].tile_first
> max_tile_first
)
3959 max_tile_first
= info
[i
].tile_first
;
3962 for (i
= 0; i
< n
; ++i
) {
3963 if (info
[i
].tile_first
== max_tile_first
)
3965 info
[i
].prefix
= extend_range(info
[i
].prefix
,
3966 info
[i
].tile_first
, max_tile_first
, 0);
3969 qsort(info
, n
, sizeof(struct band_info
), &cmp_band
);
3971 for (i
= 0; i
< n
- 1; ++i
)
3972 if (info
[i
].tile_len
!= info
[i
+ 1].tile_len
||
3973 info
[i
].n_parallel
!= info
[i
+ 1].n_parallel
)
3977 separate_bands(info
, n
);
3979 prefix
= info
[0].prefix
;
3980 suffix
= info
[0].suffix
;
3982 for (i
= 1; i
< n
; ++i
) {
3983 prefix
= isl_union_map_union(prefix
, info
[i
].prefix
);
3984 suffix
= isl_union_map_union(suffix
, info
[i
].suffix
);
3987 list_info
->tile_first
= info
[0].tile_first
;
3988 list_info
->tile_len
= -1;
3989 list_info
->prefix
= prefix
;
3990 list_info
->suffix
= suffix
;
3992 isl_band_list_free(list
);
3996 /* Set max_out to the maximal number of output dimensions over
3999 static int update_max_out(__isl_take isl_map
*map
, void *user
)
4001 int *max_out
= user
;
4002 int n_out
= isl_map_dim(map
, isl_dim_out
);
4004 if (n_out
> *max_out
)
4011 struct align_range_data
{
4016 /* Extend the dimension of the range of the given map to data->max_out and
4017 * then add the result to data->res.
4019 static int map_align_range(__isl_take isl_map
*map
, void *user
)
4021 struct align_range_data
*data
= user
;
4025 int n_out
= isl_map_dim(map
, isl_dim_out
);
4027 dim
= isl_union_map_get_dim(data
->res
);
4028 proj
= isl_map_reverse(projection(dim
, data
->max_out
, n_out
));
4029 for (i
= n_out
; i
< data
->max_out
; ++i
)
4030 proj
= isl_map_fix_si(proj
, isl_dim_out
, i
, 0);
4032 map
= isl_map_apply_range(map
, proj
);
4034 data
->res
= isl_union_map_add_map(data
->res
, map
);
4039 /* Extend the ranges of the maps in the union map such they all have
4040 * the same dimension.
4042 static __isl_give isl_union_map
*align_range(__isl_take isl_union_map
*umap
)
4044 struct align_range_data data
;
4047 isl_union_map_foreach_map(umap
, &update_max_out
, &data
.max_out
);
4049 data
.res
= isl_union_map_empty(isl_union_map_get_dim(umap
));
4050 isl_union_map_foreach_map(umap
, &map_align_range
, &data
);
4052 isl_union_map_free(umap
);
4056 /* Select the outermost tilable band that (by construction)
4057 * has at least one parallel loop.
4058 * The starting position of the aligned band is stored in the pair
4060 * The sizes and number of parallel loops may be different in different
4061 * parts of the band forest and are therefore stored in the cuda_stmts.
4063 * Return the complete schedule, with the tilable bands aligned
4064 * at gen->tile_first and padded with zero, if needed.
4066 static __isl_give isl_union_map
*select_outer_tilable_band(struct cuda_gen
*gen
,
4067 __isl_keep isl_schedule
*schedule
)
4069 isl_band_list
*list
;
4070 struct band_info info
;
4072 gen
->n_parallel
= 0;
4075 list
= isl_schedule_get_band_forest(schedule
);
4077 list_select_outer_band(gen
, list
, 0, &info
);
4079 gen
->tile_first
= info
.tile_first
;
4080 info
.suffix
= align_range(info
.suffix
);
4082 return isl_union_map_flat_range_product(info
.prefix
, info
.suffix
);
4085 /* Set gen->untiled_len to the number of scheduling dimensions
4086 * for the schedule of the first domain.
4087 * We assume here that this number is the same for all domains.
4089 static int set_untiled_len(__isl_take isl_map
*map
, void *user
)
4091 unsigned *untiled_len
= user
;
4093 *untiled_len
= isl_map_dim(map
, isl_dim_out
);
4099 /* Compute an appropriate schedule based on the accesses in
4100 * gen->read and gen->write.
4102 * We first compute dependences and then use those to compute
4103 * a schedule that has a parallel loop in each tilable band.
4104 * Finally, we select the outermost tilable band.
4106 static void compute_schedule(struct cuda_gen
*gen
,
4107 __isl_take isl_union_map
*sched
)
4109 isl_ctx
*ctx
= isl_union_map_get_ctx(sched
);
4110 isl_union_set
*domain
;
4111 isl_union_map
*empty
;
4112 isl_union_map
*dep_raw
, *dep2
, *dep3
, *dep
;
4113 isl_union_map
*uninitialized
;
4114 isl_schedule
*schedule
;
4115 struct isl_options
*options
;
4117 empty
= isl_union_map_empty(isl_union_map_get_dim(sched
));
4119 isl_union_map_compute_flow(isl_union_map_copy(gen
->read
),
4120 isl_union_map_copy(gen
->write
), empty
,
4121 isl_union_map_copy(sched
),
4122 &dep_raw
, NULL
, &uninitialized
, NULL
);
4123 isl_union_map_compute_flow(isl_union_map_copy(gen
->write
),
4124 isl_union_map_copy(gen
->write
),
4125 isl_union_map_copy(gen
->read
),
4126 isl_union_map_copy(sched
),
4127 &dep2
, &dep3
, NULL
, NULL
);
4128 isl_union_map_free(sched
);
4130 gen
->copy_in
= isl_union_map_range(uninitialized
);
4132 dep
= isl_union_map_union(dep2
, dep3
);
4133 dep
= isl_union_map_union(dep
, dep_raw
);
4134 dep
= isl_union_map_coalesce(dep
);
4136 domain
= extract_domain(gen
);
4137 options
= isl_ctx_peek_options(ctx
, isl_options_arg
);
4138 options
->schedule_outer_zero_distance
= 1;
4139 schedule
= isl_union_set_compute_schedule(isl_union_set_copy(domain
),
4140 isl_union_map_copy(dep
), dep
);
4142 sched
= select_outer_tilable_band(gen
, schedule
);
4144 isl_union_map_foreach_map(sched
, &set_untiled_len
, &gen
->untiled_len
);
4145 sched
= isl_union_map_intersect_domain(sched
, domain
);
4148 isl_schedule_free(schedule
);
4151 /* Replace the scop in the "input" file by equivalent code
4152 * that uses the GPU. "scop" is assumed to correspond to this scop.
4154 * We first compute a schedule that respects the dependences
4155 * of the original program and select the outermost band
4156 * of tilable dimensions that has at least one parallel loop.
4157 * We then have three blocks of dimensions
4161 * The tilable band "B" is first tiled according to "tile.sizes", resulting
4166 * For each iteration of the T loop and for each array, we compute
4167 * the array elements accessed by that iteration, construct a rectangular
4168 * box around it and shift it to the origin. The result is used
4169 * as shared memory for the array.
4171 * We then split off at most 2 parallel loops from the T loops and
4172 * at most 3 parallel loops from the P loops
4176 * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
4177 * according to "grid.sizes"/"block.sizes".
4179 * H T1T T1P T2 P1T P1P P2 G
4181 * Finally, the T1P and P1P iterators are equated to the block and
4182 * thread dimensions respectively and so are effectively removed.
4183 * The H loops are run on the host. The T1T, T2, P1T, P2 and G loops
4184 * are run on the GPU.
4186 * Code is generated in three stages. We first generate code for the
4187 * host (the H loops), with iterators h%d. Then, for each leaf node
4188 * of the resulting AST, we generate code for the shared loops (up to
4189 * and including T2), with iterators g%d and after equating the H loops
4190 * to h%d parameters and the T1P loops to the block dimensions.
4191 * Finally, we generate code for the remaining loops in a similar fashion.
4193 * The function frees "scop" and "ctx".
4195 int cuda_scop(isl_ctx
*ctx
, scoplib_scop_p scop
, struct ppcg_options
*options
,
4198 isl_union_map
*sched
;
4199 struct cuda_gen gen
;
4202 gen
.context
= extract_context(gen
.ctx
, scop
);
4203 gen
.context
= add_context_from_str(gen
.context
, options
->ctx
);
4204 gen
.n_stmts
= scoplib_statement_number(scop
->statement
);
4205 gen
.stmts
= extract_stmts(gen
.ctx
, scop
, gen
.context
);
4206 extract_accesses(&gen
, scop
);
4207 gen
.options
= options
;
4208 gen
.state
= cloog_isl_state_malloc(gen
.ctx
);
4210 cuda_open_files(&gen
.cuda
, input
);
4212 collect_array_info(&gen
);
4214 sched
= extract_original_schedule(&gen
, scop
);
4215 compute_schedule(&gen
, sched
);
4217 print_host_code(&gen
);
4219 cloog_state_free(gen
.state
);
4220 clear_cuda_gen(&gen
);
4221 isl_ctx_free(gen
.ctx
);
4222 scoplib_scop_free(scop
);
4224 cuda_close_files(&gen
.cuda
);