1 /* Interchange heuristics and transform for loop interchange on
2 polyhedral representation.
4 Copyright (C) 2009-2014 Free Software Foundation, Inc.
5 Contributed by Sebastian Pop <sebastian.pop@amd.com> and
6 Harsha Jagasia <harsha.jagasia@amd.com>.
8 This file is part of GCC.
10 GCC is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 GCC is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with GCC; see the file COPYING3. If not see
22 <http://www.gnu.org/licenses/>. */
30 #include <isl/union_map.h>
33 #if defined(__cplusplus)
36 #include <isl/val_gmp.h>
37 #if defined(__cplusplus)
41 #include <cloog/cloog.h>
42 #include <cloog/isl/domain.h>
47 #include "coretypes.h"
49 #include "basic-block.h"
50 #include "tree-ssa-alias.h"
51 #include "internal-fn.h"
52 #include "gimple-expr.h"
55 #include "gimple-iterator.h"
56 #include "tree-ssa-loop.h"
59 #include "tree-chrec.h"
60 #include "tree-data-ref.h"
61 #include "tree-scalar-evolution.h"
65 #include "graphite-poly.h"
67 /* XXX isl rewrite following comment */
68 /* Builds a linear expression, of dimension DIM, representing PDR's
71 L = r_{n}*r_{n-1}*...*r_{1}*s_{0} + ... + r_{n}*s_{n-1} + s_{n}.
73 For an array A[10][20] with two subscript locations s0 and s1, the
74 linear memory access is 20 * s0 + s1: a stride of 1 in subscript s0
75 corresponds to a memory stride of 20.
77 OFFSET is a number of dimensions to prepend before the
78 subscript dimensions: s_0, s_1, ..., s_n.
80 Thus, the final linear expression has the following format:
81 0 .. 0_{offset} | 0 .. 0_{nit} | 0 .. 0_{gd} | 0 | c_0 c_1 ... c_n
82 where the expression itself is:
83 c_0 * s_0 + c_1 * s_1 + ... c_n * s_n. */
85 static isl_constraint
*
86 build_linearized_memory_access (isl_map
*map
, poly_dr_p pdr
)
89 isl_local_space
*ls
= isl_local_space_from_space (isl_map_get_space (map
));
90 unsigned offset
, nsubs
;
94 isl_val
*size
, *subsize
, *size1
;
96 res
= isl_equality_alloc (ls
);
97 ctx
= isl_local_space_get_ctx (ls
);
98 size
= isl_val_int_from_ui (ctx
, 1);
100 nsubs
= isl_set_dim (pdr
->extent
, isl_dim_set
);
101 /* -1 for the already included L dimension. */
102 offset
= isl_map_dim (map
, isl_dim_out
) - 1 - nsubs
;
103 res
= isl_constraint_set_coefficient_si (res
, isl_dim_out
, offset
+ nsubs
, -1);
104 /* Go through all subscripts from last to first. First dimension
105 is the alias set, ignore it. */
106 for (i
= nsubs
- 1; i
>= 1; i
--)
111 size1
= isl_val_copy (size
);
112 res
= isl_constraint_set_coefficient_val (res
, isl_dim_out
, offset
+ i
, size
);
113 dc
= isl_set_get_space (pdr
->extent
);
114 aff
= isl_aff_zero_on_domain (isl_local_space_from_space (dc
));
115 aff
= isl_aff_set_coefficient_si (aff
, isl_dim_in
, i
, 1);
116 subsize
= isl_set_max_val (pdr
->extent
, aff
);
118 size
= isl_val_mul (size1
, subsize
);
126 /* Set STRIDE to the stride of PDR in memory by advancing by one in
127 the loop at DEPTH. */
130 pdr_stride_in_loop (mpz_t stride
, graphite_dim_t depth
, poly_dr_p pdr
)
132 poly_bb_p pbb
= PDR_PBB (pdr
);
137 isl_constraint
*lma
, *c
;
139 graphite_dim_t time_depth
;
142 /* XXX isl rewrite following comments. */
143 /* Builds a partial difference equations and inserts them
144 into pointset powerset polyhedron P. Polyhedron is assumed
145 to have the format: T|I|T'|I'|G|S|S'|l1|l2.
147 TIME_DEPTH is the time dimension w.r.t. which we are
149 OFFSET represents the number of dimensions between
150 columns t_{time_depth} and t'_{time_depth}.
151 DIM_SCTR is the number of scattering dimensions. It is
152 essentially the dimensionality of the T vector.
154 The following equations are inserted into the polyhedron P:
157 | t_{time_depth-1} = t'_{time_depth-1}
158 | t_{time_depth} = t'_{time_depth} + 1
159 | t_{time_depth+1} = t'_{time_depth + 1}
161 | t_{dim_sctr} = t'_{dim_sctr}. */
163 /* Add the equality: t_{time_depth} = t'_{time_depth} + 1.
164 This is the core part of this alogrithm, since this
165 constraint asks for the memory access stride (difference)
166 between two consecutive points in time dimensions. */
171 | t_{time_depth-1} = t'_{time_depth-1}
172 | t_{time_depth+1} = t'_{time_depth+1}
174 | t_{dim_sctr} = t'_{dim_sctr}
176 This means that all the time dimensions are equal except for
177 time_depth, where the constraint is t_{depth} = t'_{depth} + 1
178 step. More to this: we should be careful not to add equalities
179 to the 'coupled' dimensions, which happens when the one dimension
180 is stripmined dimension, and the other dimension corresponds
181 to the point loop inside stripmined dimension. */
183 /* pdr->accesses: [P1..nb_param,I1..nb_domain]->[a,S1..nb_subscript]
184 ??? [P] not used for PDRs?
185 pdr->extent: [a,S1..nb_subscript]
186 pbb->domain: [P1..nb_param,I1..nb_domain]
187 pbb->transformed: [P1..nb_param,I1..nb_domain]->[T1..Tnb_sctr]
188 [T] includes local vars (currently unused)
190 First we create [P,I] -> [T,a,S]. */
192 map
= isl_map_flat_range_product (isl_map_copy (pbb
->transformed
),
193 isl_map_copy (pdr
->accesses
));
194 /* Add a dimension for L: [P,I] -> [T,a,S,L].*/
195 map
= isl_map_add_dims (map
, isl_dim_out
, 1);
196 /* Build a constraint for "lma[S] - L == 0", effectively calculating
197 L in terms of subscripts. */
198 lma
= build_linearized_memory_access (map
, pdr
);
199 /* And add it to the map, so we now have:
200 [P,I] -> [T,a,S,L] : lma([S]) == L. */
201 map
= isl_map_add_constraint (map
, lma
);
203 /* Then we create [P,I,P',I'] -> [T,a,S,L,T',a',S',L']. */
204 map
= isl_map_flat_product (map
, isl_map_copy (map
));
206 /* Now add the equality T[time_depth] == T'[time_depth]+1. This will
207 force L' to be the linear address at T[time_depth] + 1. */
208 time_depth
= psct_dynamic_dim (pbb
, depth
);
209 /* Length of [a,S] plus [L] ... */
210 offset
= 1 + isl_map_dim (pdr
->accesses
, isl_dim_out
);
212 offset
+= isl_map_dim (pbb
->transformed
, isl_dim_out
);
214 c
= isl_equality_alloc (isl_local_space_from_space (isl_map_get_space (map
)));
215 c
= isl_constraint_set_coefficient_si (c
, isl_dim_out
, time_depth
, 1);
216 c
= isl_constraint_set_coefficient_si (c
, isl_dim_out
,
217 offset
+ time_depth
, -1);
218 c
= isl_constraint_set_constant_si (c
, 1);
219 map
= isl_map_add_constraint (map
, c
);
221 /* Now we equate most of the T/T' elements (making PITaSL nearly
222 the same is (PITaSL)', except for one dimension, namely for 'depth'
223 (an index into [I]), after translating to index into [T]. Take care
224 to not produce an empty map, which indicates we wanted to equate
225 two dimensions that are already coupled via the above time_depth
226 dimension. Happens with strip mining where several scatter dimension
227 are interdependend. */
229 nt
= pbb_nb_scattering_transform (pbb
) + pbb_nb_local_vars (pbb
);
230 for (i
= 0; i
< nt
; i
++)
233 isl_map
*temp
= isl_map_equate (isl_map_copy (map
),
235 isl_dim_out
, offset
+ i
);
236 if (isl_map_is_empty (temp
))
245 /* Now maximize the expression L' - L. */
246 set
= isl_map_range (map
);
247 dc
= isl_set_get_space (set
);
248 aff
= isl_aff_zero_on_domain (isl_local_space_from_space (dc
));
249 aff
= isl_aff_set_coefficient_si (aff
, isl_dim_in
, offset
- 1, -1);
250 aff
= isl_aff_set_coefficient_si (aff
, isl_dim_in
, offset
+ offset
- 1, 1);
251 islstride
= isl_set_max_val (set
, aff
);
252 isl_val_get_num_gmp (islstride
, stride
);
253 isl_val_free (islstride
);
257 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
259 gmp_fprintf (dump_file
, "\nStride in BB_%d, DR_%d, depth %d: %Zd ",
260 pbb_index (pbb
), PDR_ID (pdr
), (int) depth
, stride
);
264 /* Sets STRIDES to the sum of all the strides of the data references
265 accessed in LOOP at DEPTH. */
268 memory_strides_in_loop_1 (lst_p loop
, graphite_dim_t depth
, mpz_t strides
)
278 FOR_EACH_VEC_ELT (LST_SEQ (loop
), j
, l
)
280 memory_strides_in_loop_1 (l
, depth
, strides
);
282 FOR_EACH_VEC_ELT (PBB_DRS (LST_PBB (l
)), i
, pdr
)
284 pdr_stride_in_loop (s
, depth
, pdr
);
285 mpz_set_si (n
, PDR_NB_REFS (pdr
));
287 mpz_add (strides
, strides
, s
);
294 /* Sets STRIDES to the sum of all the strides of the data references
295 accessed in LOOP at DEPTH. */
298 memory_strides_in_loop (lst_p loop
, graphite_dim_t depth
, mpz_t strides
)
300 if (mpz_cmp_si (loop
->memory_strides
, -1) == 0)
302 mpz_set_si (strides
, 0);
303 memory_strides_in_loop_1 (loop
, depth
, strides
);
306 mpz_set (strides
, loop
->memory_strides
);
309 /* Return true when the interchange of loops LOOP1 and LOOP2 is
322 | for (i = 0; i < N; i++)
323 | for (j = 0; j < N; j++)
329 The data access A[j][i] is described like this:
337 | 0 0 0 0 -1 0 100 >= 0
338 | 0 0 0 0 0 -1 100 >= 0
340 The linearized memory access L to A[100][100] is:
345 TODO: the shown format is not valid as it does not show the fact
346 that the iteration domain "i j" is transformed using the scattering.
348 Next, to measure the impact of iterating once in loop "i", we build
349 a maximization problem: first, we add to DR accesses the dimensions
350 k, s2, s3, L1 = 100 * s0 + s1, L2, and D1: this is the polyhedron P1.
351 L1 and L2 are the linearized memory access functions.
353 | i j N a s0 s1 k s2 s3 L1 L2 D1 1
354 | 0 0 0 1 0 0 0 0 0 0 0 0 -5 = 0 alias = 5
355 | 0 -1 0 0 1 0 0 0 0 0 0 0 0 = 0 s0 = j
356 |-2 0 0 0 0 1 0 0 0 0 0 0 0 = 0 s1 = 2 * i
357 | 0 0 0 0 1 0 0 0 0 0 0 0 0 >= 0
358 | 0 0 0 0 0 1 0 0 0 0 0 0 0 >= 0
359 | 0 0 0 0 -1 0 0 0 0 0 0 0 100 >= 0
360 | 0 0 0 0 0 -1 0 0 0 0 0 0 100 >= 0
361 | 0 0 0 0 100 1 0 0 0 -1 0 0 0 = 0 L1 = 100 * s0 + s1
363 Then, we generate the polyhedron P2 by interchanging the dimensions
364 (s0, s2), (s1, s3), (L1, L2), (k, i)
366 | i j N a s0 s1 k s2 s3 L1 L2 D1 1
367 | 0 0 0 1 0 0 0 0 0 0 0 0 -5 = 0 alias = 5
368 | 0 -1 0 0 0 0 0 1 0 0 0 0 0 = 0 s2 = j
369 | 0 0 0 0 0 0 -2 0 1 0 0 0 0 = 0 s3 = 2 * k
370 | 0 0 0 0 0 0 0 1 0 0 0 0 0 >= 0
371 | 0 0 0 0 0 0 0 0 1 0 0 0 0 >= 0
372 | 0 0 0 0 0 0 0 -1 0 0 0 0 100 >= 0
373 | 0 0 0 0 0 0 0 0 -1 0 0 0 100 >= 0
374 | 0 0 0 0 0 0 0 100 1 0 -1 0 0 = 0 L2 = 100 * s2 + s3
376 then we add to P2 the equality k = i + 1:
378 |-1 0 0 0 0 0 1 0 0 0 0 0 -1 = 0 k = i + 1
380 and finally we maximize the expression "D1 = max (P1 inter P2, L2 - L1)".
382 Similarly, to determine the impact of one iteration on loop "j", we
383 interchange (k, j), we add "k = j + 1", and we compute D2 the
384 maximal value of the difference.
386 Finally, the profitability test is D1 < D2: if in the outer loop
387 the strides are smaller than in the inner loop, then it is
388 profitable to interchange the loops at DEPTH1 and DEPTH2. */
391 lst_interchange_profitable_p (lst_p nest
, int depth1
, int depth2
)
396 gcc_assert (depth1
< depth2
);
401 memory_strides_in_loop (nest
, depth1
, d1
);
402 memory_strides_in_loop (nest
, depth2
, d2
);
404 res
= mpz_cmp (d1
, d2
) < 0;
412 /* Interchanges the loops at DEPTH1 and DEPTH2 of the original
413 scattering and assigns the resulting polyhedron to the transformed
417 pbb_interchange_loop_depths (graphite_dim_t depth1
, graphite_dim_t depth2
,
421 unsigned dim1
= psct_dynamic_dim (pbb
, depth1
);
422 unsigned dim2
= psct_dynamic_dim (pbb
, depth2
);
423 isl_space
*d
= isl_map_get_space (pbb
->transformed
);
424 isl_space
*d1
= isl_space_range (d
);
425 unsigned n
= isl_space_dim (d1
, isl_dim_out
);
426 isl_space
*d2
= isl_space_add_dims (d1
, isl_dim_in
, n
);
427 isl_map
*x
= isl_map_universe (d2
);
429 x
= isl_map_equate (x
, isl_dim_in
, dim1
, isl_dim_out
, dim2
);
430 x
= isl_map_equate (x
, isl_dim_in
, dim2
, isl_dim_out
, dim1
);
432 for (i
= 0; i
< n
; i
++)
433 if (i
!= dim1
&& i
!= dim2
)
434 x
= isl_map_equate (x
, isl_dim_in
, i
, isl_dim_out
, i
);
436 pbb
->transformed
= isl_map_apply_range (pbb
->transformed
, x
);
439 /* Apply the interchange of loops at depths DEPTH1 and DEPTH2 to all
440 the statements below LST. */
443 lst_apply_interchange (lst_p lst
, int depth1
, int depth2
)
448 if (LST_LOOP_P (lst
))
453 FOR_EACH_VEC_ELT (LST_SEQ (lst
), i
, l
)
454 lst_apply_interchange (l
, depth1
, depth2
);
457 pbb_interchange_loop_depths (depth1
, depth2
, LST_PBB (lst
));
460 /* Return true when the nest starting at LOOP1 and ending on LOOP2 is
461 perfect: i.e. there are no sequence of statements. */
464 lst_perfectly_nested_p (lst_p loop1
, lst_p loop2
)
469 if (!LST_LOOP_P (loop1
))
472 return LST_SEQ (loop1
).length () == 1
473 && lst_perfectly_nested_p (LST_SEQ (loop1
)[0], loop2
);
476 /* Transform the loop nest between LOOP1 and LOOP2 into a perfect
477 nest. To continue the naming tradition, this function is called
478 after perfect_nestify. NEST is set to the perfectly nested loop
479 that is created. BEFORE/AFTER are set to the loops distributed
480 before/after the loop NEST. */
483 lst_perfect_nestify (lst_p loop1
, lst_p loop2
, lst_p
*before
,
484 lst_p
*nest
, lst_p
*after
)
486 poly_bb_p first
, last
;
488 gcc_assert (loop1
&& loop2
490 && LST_LOOP_P (loop1
) && LST_LOOP_P (loop2
));
492 first
= LST_PBB (lst_find_first_pbb (loop2
));
493 last
= LST_PBB (lst_find_last_pbb (loop2
));
495 *before
= copy_lst (loop1
);
496 *nest
= copy_lst (loop1
);
497 *after
= copy_lst (loop1
);
499 lst_remove_all_before_including_pbb (*before
, first
, false);
500 lst_remove_all_before_including_pbb (*after
, last
, true);
502 lst_remove_all_before_excluding_pbb (*nest
, first
, true);
503 lst_remove_all_before_excluding_pbb (*nest
, last
, false);
505 if (lst_empty_p (*before
))
510 if (lst_empty_p (*after
))
515 if (lst_empty_p (*nest
))
522 /* Try to interchange LOOP1 with LOOP2 for all the statements of the
523 body of LOOP2. LOOP1 contains LOOP2. Return true if it did the
527 lst_try_interchange_loops (scop_p scop
, lst_p loop1
, lst_p loop2
)
529 int depth1
= lst_depth (loop1
);
530 int depth2
= lst_depth (loop2
);
533 lst_p before
= NULL
, nest
= NULL
, after
= NULL
;
535 if (!lst_perfectly_nested_p (loop1
, loop2
))
536 lst_perfect_nestify (loop1
, loop2
, &before
, &nest
, &after
);
538 if (!lst_interchange_profitable_p (loop2
, depth1
, depth2
))
541 lst_apply_interchange (loop2
, depth1
, depth2
);
543 /* Sync the transformed LST information and the PBB scatterings
544 before using the scatterings in the data dependence analysis. */
545 if (before
|| nest
|| after
)
547 transformed
= lst_substitute_3 (SCOP_TRANSFORMED_SCHEDULE (scop
), loop1
,
548 before
, nest
, after
);
549 lst_update_scattering (transformed
);
550 free_lst (transformed
);
553 if (graphite_legal_transform (scop
))
555 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
557 "Loops at depths %d and %d will be interchanged.\n",
560 /* Transform the SCOP_TRANSFORMED_SCHEDULE of the SCOP. */
561 lst_insert_in_sequence (before
, loop1
, true);
562 lst_insert_in_sequence (after
, loop1
, false);
566 lst_replace (loop1
, nest
);
573 /* Undo the transform. */
577 lst_apply_interchange (loop2
, depth2
, depth1
);
581 /* Selects the inner loop in LST_SEQ (INNER_FATHER) to be interchanged
582 with the loop OUTER in LST_SEQ (OUTER_FATHER). */
585 lst_interchange_select_inner (scop_p scop
, lst_p outer_father
, int outer
,
591 gcc_assert (outer_father
592 && LST_LOOP_P (outer_father
)
593 && LST_LOOP_P (LST_SEQ (outer_father
)[outer
])
595 && LST_LOOP_P (inner_father
));
597 loop1
= LST_SEQ (outer_father
)[outer
];
599 FOR_EACH_VEC_ELT (LST_SEQ (inner_father
), inner
, loop2
)
600 if (LST_LOOP_P (loop2
)
601 && (lst_try_interchange_loops (scop
, loop1
, loop2
)
602 || lst_interchange_select_inner (scop
, outer_father
, outer
, loop2
)))
608 /* Interchanges all the loops of LOOP and the loops of its body that
609 are considered profitable to interchange. Return the number of
610 interchanged loops. OUTER is the index in LST_SEQ (LOOP) that
611 points to the next outer loop to be considered for interchange. */
614 lst_interchange_select_outer (scop_p scop
, lst_p loop
, int outer
)
621 if (!loop
|| !LST_LOOP_P (loop
))
624 father
= LST_LOOP_FATHER (loop
);
627 while (lst_interchange_select_inner (scop
, father
, outer
, loop
))
630 loop
= LST_SEQ (father
)[outer
];
634 if (LST_LOOP_P (loop
))
635 FOR_EACH_VEC_ELT (LST_SEQ (loop
), i
, l
)
637 res
+= lst_interchange_select_outer (scop
, l
, i
);
642 /* Interchanges all the loop depths that are considered profitable for
643 SCOP. Return the number of interchanged loops. */
646 scop_do_interchange (scop_p scop
)
648 int res
= lst_interchange_select_outer
649 (scop
, SCOP_TRANSFORMED_SCHEDULE (scop
), 0);
651 lst_update_scattering (SCOP_TRANSFORMED_SCHEDULE (scop
));