1 /* Interchange heuristics and transform for loop interchange on
2 polyhedral representation.
4 Copyright (C) 2009-2015 Free Software Foundation, Inc.
5 Contributed by Sebastian Pop <sebastian.pop@amd.com> and
6 Harsha Jagasia <harsha.jagasia@amd.com>.
8 This file is part of GCC.
10 GCC is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 GCC is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with GCC; see the file COPYING3. If not see
22 <http://www.gnu.org/licenses/>. */
30 #include <isl/union_map.h>
34 /* Since ISL-0.13, the extern is in val_gmp.h. */
35 #if !defined(HAVE_ISL_SCHED_CONSTRAINTS_COMPUTE_SCHEDULE) && defined(__cplusplus)
38 #include <isl/val_gmp.h>
39 #if !defined(HAVE_ISL_SCHED_CONSTRAINTS_COMPUTE_SCHEDULE) && defined(__cplusplus)
45 #include "coretypes.h"
49 #include "double-int.h"
57 #include "fold-const.h"
60 #include "hard-reg-set.h"
63 #include "dominance.h"
65 #include "basic-block.h"
66 #include "tree-ssa-alias.h"
67 #include "internal-fn.h"
68 #include "gimple-expr.h"
71 #include "gimple-iterator.h"
72 #include "tree-ssa-loop.h"
75 #include "tree-chrec.h"
76 #include "tree-data-ref.h"
77 #include "tree-scalar-evolution.h"
81 #include "graphite-poly.h"
83 /* XXX isl rewrite following comment */
84 /* Builds a linear expression, of dimension DIM, representing PDR's
87 L = r_{n}*r_{n-1}*...*r_{1}*s_{0} + ... + r_{n}*s_{n-1} + s_{n}.
89 For an array A[10][20] with two subscript locations s0 and s1, the
90 linear memory access is 20 * s0 + s1: a stride of 1 in subscript s0
91 corresponds to a memory stride of 20.
93 OFFSET is a number of dimensions to prepend before the
94 subscript dimensions: s_0, s_1, ..., s_n.
96 Thus, the final linear expression has the following format:
97 0 .. 0_{offset} | 0 .. 0_{nit} | 0 .. 0_{gd} | 0 | c_0 c_1 ... c_n
98 where the expression itself is:
99 c_0 * s_0 + c_1 * s_1 + ... c_n * s_n. */
101 static isl_constraint
*
102 build_linearized_memory_access (isl_map
*map
, poly_dr_p pdr
)
105 isl_local_space
*ls
= isl_local_space_from_space (isl_map_get_space (map
));
106 unsigned offset
, nsubs
;
110 isl_val
*size
, *subsize
, *size1
;
112 res
= isl_equality_alloc (ls
);
113 ctx
= isl_local_space_get_ctx (ls
);
114 size
= isl_val_int_from_ui (ctx
, 1);
116 nsubs
= isl_set_dim (pdr
->extent
, isl_dim_set
);
117 /* -1 for the already included L dimension. */
118 offset
= isl_map_dim (map
, isl_dim_out
) - 1 - nsubs
;
119 res
= isl_constraint_set_coefficient_si (res
, isl_dim_out
, offset
+ nsubs
, -1);
120 /* Go through all subscripts from last to first. First dimension
121 is the alias set, ignore it. */
122 for (i
= nsubs
- 1; i
>= 1; i
--)
127 size1
= isl_val_copy (size
);
128 res
= isl_constraint_set_coefficient_val (res
, isl_dim_out
, offset
+ i
, size
);
129 dc
= isl_set_get_space (pdr
->extent
);
130 aff
= isl_aff_zero_on_domain (isl_local_space_from_space (dc
));
131 aff
= isl_aff_set_coefficient_si (aff
, isl_dim_in
, i
, 1);
132 subsize
= isl_set_max_val (pdr
->extent
, aff
);
134 size
= isl_val_mul (size1
, subsize
);
142 /* Set STRIDE to the stride of PDR in memory by advancing by one in
143 the loop at DEPTH. */
146 pdr_stride_in_loop (mpz_t stride
, graphite_dim_t depth
, poly_dr_p pdr
)
148 poly_bb_p pbb
= PDR_PBB (pdr
);
153 isl_constraint
*lma
, *c
;
155 graphite_dim_t time_depth
;
158 /* XXX isl rewrite following comments. */
159 /* Builds a partial difference equations and inserts them
160 into pointset powerset polyhedron P. Polyhedron is assumed
161 to have the format: T|I|T'|I'|G|S|S'|l1|l2.
163 TIME_DEPTH is the time dimension w.r.t. which we are
165 OFFSET represents the number of dimensions between
166 columns t_{time_depth} and t'_{time_depth}.
167 DIM_SCTR is the number of scattering dimensions. It is
168 essentially the dimensionality of the T vector.
170 The following equations are inserted into the polyhedron P:
173 | t_{time_depth-1} = t'_{time_depth-1}
174 | t_{time_depth} = t'_{time_depth} + 1
175 | t_{time_depth+1} = t'_{time_depth + 1}
177 | t_{dim_sctr} = t'_{dim_sctr}. */
179 /* Add the equality: t_{time_depth} = t'_{time_depth} + 1.
180 This is the core part of this alogrithm, since this
181 constraint asks for the memory access stride (difference)
182 between two consecutive points in time dimensions. */
187 | t_{time_depth-1} = t'_{time_depth-1}
188 | t_{time_depth+1} = t'_{time_depth+1}
190 | t_{dim_sctr} = t'_{dim_sctr}
192 This means that all the time dimensions are equal except for
193 time_depth, where the constraint is t_{depth} = t'_{depth} + 1
194 step. More to this: we should be careful not to add equalities
195 to the 'coupled' dimensions, which happens when the one dimension
196 is stripmined dimension, and the other dimension corresponds
197 to the point loop inside stripmined dimension. */
199 /* pdr->accesses: [P1..nb_param,I1..nb_domain]->[a,S1..nb_subscript]
200 ??? [P] not used for PDRs?
201 pdr->extent: [a,S1..nb_subscript]
202 pbb->domain: [P1..nb_param,I1..nb_domain]
203 pbb->transformed: [P1..nb_param,I1..nb_domain]->[T1..Tnb_sctr]
204 [T] includes local vars (currently unused)
206 First we create [P,I] -> [T,a,S]. */
208 map
= isl_map_flat_range_product (isl_map_copy (pbb
->transformed
),
209 isl_map_copy (pdr
->accesses
));
210 /* Add a dimension for L: [P,I] -> [T,a,S,L].*/
211 map
= isl_map_add_dims (map
, isl_dim_out
, 1);
212 /* Build a constraint for "lma[S] - L == 0", effectively calculating
213 L in terms of subscripts. */
214 lma
= build_linearized_memory_access (map
, pdr
);
215 /* And add it to the map, so we now have:
216 [P,I] -> [T,a,S,L] : lma([S]) == L. */
217 map
= isl_map_add_constraint (map
, lma
);
219 /* Then we create [P,I,P',I'] -> [T,a,S,L,T',a',S',L']. */
220 map
= isl_map_flat_product (map
, isl_map_copy (map
));
222 /* Now add the equality T[time_depth] == T'[time_depth]+1. This will
223 force L' to be the linear address at T[time_depth] + 1. */
224 time_depth
= psct_dynamic_dim (pbb
, depth
);
225 /* Length of [a,S] plus [L] ... */
226 offset
= 1 + isl_map_dim (pdr
->accesses
, isl_dim_out
);
228 offset
+= isl_map_dim (pbb
->transformed
, isl_dim_out
);
230 c
= isl_equality_alloc (isl_local_space_from_space (isl_map_get_space (map
)));
231 c
= isl_constraint_set_coefficient_si (c
, isl_dim_out
, time_depth
, 1);
232 c
= isl_constraint_set_coefficient_si (c
, isl_dim_out
,
233 offset
+ time_depth
, -1);
234 c
= isl_constraint_set_constant_si (c
, 1);
235 map
= isl_map_add_constraint (map
, c
);
237 /* Now we equate most of the T/T' elements (making PITaSL nearly
238 the same is (PITaSL)', except for one dimension, namely for 'depth'
239 (an index into [I]), after translating to index into [T]. Take care
240 to not produce an empty map, which indicates we wanted to equate
241 two dimensions that are already coupled via the above time_depth
242 dimension. Happens with strip mining where several scatter dimension
243 are interdependend. */
245 nt
= pbb_nb_scattering_transform (pbb
) + pbb_nb_local_vars (pbb
);
246 for (i
= 0; i
< nt
; i
++)
249 isl_map
*temp
= isl_map_equate (isl_map_copy (map
),
251 isl_dim_out
, offset
+ i
);
252 if (isl_map_is_empty (temp
))
261 /* Now maximize the expression L' - L. */
262 set
= isl_map_range (map
);
263 dc
= isl_set_get_space (set
);
264 aff
= isl_aff_zero_on_domain (isl_local_space_from_space (dc
));
265 aff
= isl_aff_set_coefficient_si (aff
, isl_dim_in
, offset
- 1, -1);
266 aff
= isl_aff_set_coefficient_si (aff
, isl_dim_in
, offset
+ offset
- 1, 1);
267 islstride
= isl_set_max_val (set
, aff
);
268 isl_val_get_num_gmp (islstride
, stride
);
269 isl_val_free (islstride
);
273 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
275 gmp_fprintf (dump_file
, "\nStride in BB_%d, DR_%d, depth %d: %Zd ",
276 pbb_index (pbb
), PDR_ID (pdr
), (int) depth
, stride
);
280 /* Sets STRIDES to the sum of all the strides of the data references
281 accessed in LOOP at DEPTH. */
284 memory_strides_in_loop_1 (lst_p loop
, graphite_dim_t depth
, mpz_t strides
)
294 FOR_EACH_VEC_ELT (LST_SEQ (loop
), j
, l
)
296 memory_strides_in_loop_1 (l
, depth
, strides
);
298 FOR_EACH_VEC_ELT (PBB_DRS (LST_PBB (l
)), i
, pdr
)
300 pdr_stride_in_loop (s
, depth
, pdr
);
301 mpz_set_si (n
, PDR_NB_REFS (pdr
));
303 mpz_add (strides
, strides
, s
);
310 /* Sets STRIDES to the sum of all the strides of the data references
311 accessed in LOOP at DEPTH. */
314 memory_strides_in_loop (lst_p loop
, graphite_dim_t depth
, mpz_t strides
)
316 if (mpz_cmp_si (loop
->memory_strides
, -1) == 0)
318 mpz_set_si (strides
, 0);
319 memory_strides_in_loop_1 (loop
, depth
, strides
);
322 mpz_set (strides
, loop
->memory_strides
);
325 /* Return true when the interchange of loops LOOP1 and LOOP2 is
338 | for (i = 0; i < N; i++)
339 | for (j = 0; j < N; j++)
345 The data access A[j][i] is described like this:
353 | 0 0 0 0 -1 0 100 >= 0
354 | 0 0 0 0 0 -1 100 >= 0
356 The linearized memory access L to A[100][100] is:
361 TODO: the shown format is not valid as it does not show the fact
362 that the iteration domain "i j" is transformed using the scattering.
364 Next, to measure the impact of iterating once in loop "i", we build
365 a maximization problem: first, we add to DR accesses the dimensions
366 k, s2, s3, L1 = 100 * s0 + s1, L2, and D1: this is the polyhedron P1.
367 L1 and L2 are the linearized memory access functions.
369 | i j N a s0 s1 k s2 s3 L1 L2 D1 1
370 | 0 0 0 1 0 0 0 0 0 0 0 0 -5 = 0 alias = 5
371 | 0 -1 0 0 1 0 0 0 0 0 0 0 0 = 0 s0 = j
372 |-2 0 0 0 0 1 0 0 0 0 0 0 0 = 0 s1 = 2 * i
373 | 0 0 0 0 1 0 0 0 0 0 0 0 0 >= 0
374 | 0 0 0 0 0 1 0 0 0 0 0 0 0 >= 0
375 | 0 0 0 0 -1 0 0 0 0 0 0 0 100 >= 0
376 | 0 0 0 0 0 -1 0 0 0 0 0 0 100 >= 0
377 | 0 0 0 0 100 1 0 0 0 -1 0 0 0 = 0 L1 = 100 * s0 + s1
379 Then, we generate the polyhedron P2 by interchanging the dimensions
380 (s0, s2), (s1, s3), (L1, L2), (k, i)
382 | i j N a s0 s1 k s2 s3 L1 L2 D1 1
383 | 0 0 0 1 0 0 0 0 0 0 0 0 -5 = 0 alias = 5
384 | 0 -1 0 0 0 0 0 1 0 0 0 0 0 = 0 s2 = j
385 | 0 0 0 0 0 0 -2 0 1 0 0 0 0 = 0 s3 = 2 * k
386 | 0 0 0 0 0 0 0 1 0 0 0 0 0 >= 0
387 | 0 0 0 0 0 0 0 0 1 0 0 0 0 >= 0
388 | 0 0 0 0 0 0 0 -1 0 0 0 0 100 >= 0
389 | 0 0 0 0 0 0 0 0 -1 0 0 0 100 >= 0
390 | 0 0 0 0 0 0 0 100 1 0 -1 0 0 = 0 L2 = 100 * s2 + s3
392 then we add to P2 the equality k = i + 1:
394 |-1 0 0 0 0 0 1 0 0 0 0 0 -1 = 0 k = i + 1
396 and finally we maximize the expression "D1 = max (P1 inter P2, L2 - L1)".
398 Similarly, to determine the impact of one iteration on loop "j", we
399 interchange (k, j), we add "k = j + 1", and we compute D2 the
400 maximal value of the difference.
402 Finally, the profitability test is D1 < D2: if in the outer loop
403 the strides are smaller than in the inner loop, then it is
404 profitable to interchange the loops at DEPTH1 and DEPTH2. */
407 lst_interchange_profitable_p (lst_p nest
, int depth1
, int depth2
)
412 gcc_assert (depth1
< depth2
);
417 memory_strides_in_loop (nest
, depth1
, d1
);
418 memory_strides_in_loop (nest
, depth2
, d2
);
420 res
= mpz_cmp (d1
, d2
) < 0;
428 /* Interchanges the loops at DEPTH1 and DEPTH2 of the original
429 scattering and assigns the resulting polyhedron to the transformed
433 pbb_interchange_loop_depths (graphite_dim_t depth1
, graphite_dim_t depth2
,
437 unsigned dim1
= psct_dynamic_dim (pbb
, depth1
);
438 unsigned dim2
= psct_dynamic_dim (pbb
, depth2
);
439 isl_space
*d
= isl_map_get_space (pbb
->transformed
);
440 isl_space
*d1
= isl_space_range (d
);
441 unsigned n
= isl_space_dim (d1
, isl_dim_out
);
442 isl_space
*d2
= isl_space_add_dims (d1
, isl_dim_in
, n
);
443 isl_map
*x
= isl_map_universe (d2
);
445 x
= isl_map_equate (x
, isl_dim_in
, dim1
, isl_dim_out
, dim2
);
446 x
= isl_map_equate (x
, isl_dim_in
, dim2
, isl_dim_out
, dim1
);
448 for (i
= 0; i
< n
; i
++)
449 if (i
!= dim1
&& i
!= dim2
)
450 x
= isl_map_equate (x
, isl_dim_in
, i
, isl_dim_out
, i
);
452 pbb
->transformed
= isl_map_apply_range (pbb
->transformed
, x
);
455 /* Apply the interchange of loops at depths DEPTH1 and DEPTH2 to all
456 the statements below LST. */
459 lst_apply_interchange (lst_p lst
, int depth1
, int depth2
)
464 if (LST_LOOP_P (lst
))
469 FOR_EACH_VEC_ELT (LST_SEQ (lst
), i
, l
)
470 lst_apply_interchange (l
, depth1
, depth2
);
473 pbb_interchange_loop_depths (depth1
, depth2
, LST_PBB (lst
));
476 /* Return true when the nest starting at LOOP1 and ending on LOOP2 is
477 perfect: i.e. there are no sequence of statements. */
480 lst_perfectly_nested_p (lst_p loop1
, lst_p loop2
)
485 if (!LST_LOOP_P (loop1
))
488 return LST_SEQ (loop1
).length () == 1
489 && lst_perfectly_nested_p (LST_SEQ (loop1
)[0], loop2
);
492 /* Transform the loop nest between LOOP1 and LOOP2 into a perfect
493 nest. To continue the naming tradition, this function is called
494 after perfect_nestify. NEST is set to the perfectly nested loop
495 that is created. BEFORE/AFTER are set to the loops distributed
496 before/after the loop NEST. */
499 lst_perfect_nestify (lst_p loop1
, lst_p loop2
, lst_p
*before
,
500 lst_p
*nest
, lst_p
*after
)
502 poly_bb_p first
, last
;
504 gcc_assert (loop1
&& loop2
506 && LST_LOOP_P (loop1
) && LST_LOOP_P (loop2
));
508 first
= LST_PBB (lst_find_first_pbb (loop2
));
509 last
= LST_PBB (lst_find_last_pbb (loop2
));
511 *before
= copy_lst (loop1
);
512 *nest
= copy_lst (loop1
);
513 *after
= copy_lst (loop1
);
515 lst_remove_all_before_including_pbb (*before
, first
, false);
516 lst_remove_all_before_including_pbb (*after
, last
, true);
518 lst_remove_all_before_excluding_pbb (*nest
, first
, true);
519 lst_remove_all_before_excluding_pbb (*nest
, last
, false);
521 if (lst_empty_p (*before
))
526 if (lst_empty_p (*after
))
531 if (lst_empty_p (*nest
))
538 /* Try to interchange LOOP1 with LOOP2 for all the statements of the
539 body of LOOP2. LOOP1 contains LOOP2. Return true if it did the
543 lst_try_interchange_loops (scop_p scop
, lst_p loop1
, lst_p loop2
)
545 int depth1
= lst_depth (loop1
);
546 int depth2
= lst_depth (loop2
);
549 lst_p before
= NULL
, nest
= NULL
, after
= NULL
;
551 if (!lst_perfectly_nested_p (loop1
, loop2
))
552 lst_perfect_nestify (loop1
, loop2
, &before
, &nest
, &after
);
554 if (!lst_interchange_profitable_p (loop2
, depth1
, depth2
))
557 lst_apply_interchange (loop2
, depth1
, depth2
);
559 /* Sync the transformed LST information and the PBB scatterings
560 before using the scatterings in the data dependence analysis. */
561 if (before
|| nest
|| after
)
563 transformed
= lst_substitute_3 (SCOP_TRANSFORMED_SCHEDULE (scop
), loop1
,
564 before
, nest
, after
);
565 lst_update_scattering (transformed
);
566 free_lst (transformed
);
569 if (graphite_legal_transform (scop
))
571 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
573 "Loops at depths %d and %d will be interchanged.\n",
576 /* Transform the SCOP_TRANSFORMED_SCHEDULE of the SCOP. */
577 lst_insert_in_sequence (before
, loop1
, true);
578 lst_insert_in_sequence (after
, loop1
, false);
582 lst_replace (loop1
, nest
);
589 /* Undo the transform. */
593 lst_apply_interchange (loop2
, depth2
, depth1
);
597 /* Selects the inner loop in LST_SEQ (INNER_FATHER) to be interchanged
598 with the loop OUTER in LST_SEQ (OUTER_FATHER). */
601 lst_interchange_select_inner (scop_p scop
, lst_p outer_father
, int outer
,
607 gcc_assert (outer_father
608 && LST_LOOP_P (outer_father
)
609 && LST_LOOP_P (LST_SEQ (outer_father
)[outer
])
611 && LST_LOOP_P (inner_father
));
613 loop1
= LST_SEQ (outer_father
)[outer
];
615 FOR_EACH_VEC_ELT (LST_SEQ (inner_father
), inner
, loop2
)
616 if (LST_LOOP_P (loop2
)
617 && (lst_try_interchange_loops (scop
, loop1
, loop2
)
618 || lst_interchange_select_inner (scop
, outer_father
, outer
, loop2
)))
624 /* Interchanges all the loops of LOOP and the loops of its body that
625 are considered profitable to interchange. Return the number of
626 interchanged loops. OUTER is the index in LST_SEQ (LOOP) that
627 points to the next outer loop to be considered for interchange. */
630 lst_interchange_select_outer (scop_p scop
, lst_p loop
, int outer
)
637 if (!loop
|| !LST_LOOP_P (loop
))
640 father
= LST_LOOP_FATHER (loop
);
643 while (lst_interchange_select_inner (scop
, father
, outer
, loop
))
646 loop
= LST_SEQ (father
)[outer
];
650 if (LST_LOOP_P (loop
))
651 FOR_EACH_VEC_ELT (LST_SEQ (loop
), i
, l
)
653 res
+= lst_interchange_select_outer (scop
, l
, i
);
658 /* Interchanges all the loop depths that are considered profitable for
659 SCOP. Return the number of interchanged loops. */
662 scop_do_interchange (scop_p scop
)
664 int res
= lst_interchange_select_outer
665 (scop
, SCOP_TRANSFORMED_SCHEDULE (scop
), 0);
667 lst_update_scattering (SCOP_TRANSFORMED_SCHEDULE (scop
));