1 /* Interchange heuristics and transform for loop interchange on
2 polyhedral representation.
4 Copyright (C) 2009-2015 Free Software Foundation, Inc.
5 Contributed by Sebastian Pop <sebastian.pop@amd.com> and
6 Harsha Jagasia <harsha.jagasia@amd.com>.
8 This file is part of GCC.
10 GCC is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 GCC is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with GCC; see the file COPYING3. If not see
22 <http://www.gnu.org/licenses/>. */
27 #include <isl/constraint.h>
31 #include <isl/union_map.h>
35 /* Since ISL-0.13, the extern is in val_gmp.h. */
36 #if !defined(HAVE_ISL_SCHED_CONSTRAINTS_COMPUTE_SCHEDULE) && defined(__cplusplus)
39 #include <isl/val_gmp.h>
40 #if !defined(HAVE_ISL_SCHED_CONSTRAINTS_COMPUTE_SCHEDULE) && defined(__cplusplus)
46 #include "coretypes.h"
50 #include "double-int.h"
58 #include "fold-const.h"
61 #include "hard-reg-set.h"
64 #include "dominance.h"
66 #include "basic-block.h"
67 #include "tree-ssa-alias.h"
68 #include "internal-fn.h"
69 #include "gimple-expr.h"
72 #include "gimple-iterator.h"
73 #include "tree-ssa-loop.h"
76 #include "tree-chrec.h"
77 #include "tree-data-ref.h"
78 #include "tree-scalar-evolution.h"
82 #include "graphite-poly.h"
84 /* XXX isl rewrite following comment */
85 /* Builds a linear expression, of dimension DIM, representing PDR's
88 L = r_{n}*r_{n-1}*...*r_{1}*s_{0} + ... + r_{n}*s_{n-1} + s_{n}.
90 For an array A[10][20] with two subscript locations s0 and s1, the
91 linear memory access is 20 * s0 + s1: a stride of 1 in subscript s0
92 corresponds to a memory stride of 20.
94 OFFSET is a number of dimensions to prepend before the
95 subscript dimensions: s_0, s_1, ..., s_n.
97 Thus, the final linear expression has the following format:
98 0 .. 0_{offset} | 0 .. 0_{nit} | 0 .. 0_{gd} | 0 | c_0 c_1 ... c_n
99 where the expression itself is:
100 c_0 * s_0 + c_1 * s_1 + ... c_n * s_n. */
102 static isl_constraint
*
103 build_linearized_memory_access (isl_map
*map
, poly_dr_p pdr
)
106 isl_local_space
*ls
= isl_local_space_from_space (isl_map_get_space (map
));
107 unsigned offset
, nsubs
;
111 isl_val
*size
, *subsize
, *size1
;
113 res
= isl_equality_alloc (ls
);
114 ctx
= isl_local_space_get_ctx (ls
);
115 size
= isl_val_int_from_ui (ctx
, 1);
117 nsubs
= isl_set_dim (pdr
->extent
, isl_dim_set
);
118 /* -1 for the already included L dimension. */
119 offset
= isl_map_dim (map
, isl_dim_out
) - 1 - nsubs
;
120 res
= isl_constraint_set_coefficient_si (res
, isl_dim_out
, offset
+ nsubs
, -1);
121 /* Go through all subscripts from last to first. First dimension
122 is the alias set, ignore it. */
123 for (i
= nsubs
- 1; i
>= 1; i
--)
128 size1
= isl_val_copy (size
);
129 res
= isl_constraint_set_coefficient_val (res
, isl_dim_out
, offset
+ i
, size
);
130 dc
= isl_set_get_space (pdr
->extent
);
131 aff
= isl_aff_zero_on_domain (isl_local_space_from_space (dc
));
132 aff
= isl_aff_set_coefficient_si (aff
, isl_dim_in
, i
, 1);
133 subsize
= isl_set_max_val (pdr
->extent
, aff
);
135 size
= isl_val_mul (size1
, subsize
);
143 /* Set STRIDE to the stride of PDR in memory by advancing by one in
144 the loop at DEPTH. */
147 pdr_stride_in_loop (mpz_t stride
, graphite_dim_t depth
, poly_dr_p pdr
)
149 poly_bb_p pbb
= PDR_PBB (pdr
);
154 isl_constraint
*lma
, *c
;
156 graphite_dim_t time_depth
;
159 /* XXX isl rewrite following comments. */
160 /* Builds a partial difference equations and inserts them
161 into pointset powerset polyhedron P. Polyhedron is assumed
162 to have the format: T|I|T'|I'|G|S|S'|l1|l2.
164 TIME_DEPTH is the time dimension w.r.t. which we are
166 OFFSET represents the number of dimensions between
167 columns t_{time_depth} and t'_{time_depth}.
168 DIM_SCTR is the number of scattering dimensions. It is
169 essentially the dimensionality of the T vector.
171 The following equations are inserted into the polyhedron P:
174 | t_{time_depth-1} = t'_{time_depth-1}
175 | t_{time_depth} = t'_{time_depth} + 1
176 | t_{time_depth+1} = t'_{time_depth + 1}
178 | t_{dim_sctr} = t'_{dim_sctr}. */
180 /* Add the equality: t_{time_depth} = t'_{time_depth} + 1.
181 This is the core part of this alogrithm, since this
182 constraint asks for the memory access stride (difference)
183 between two consecutive points in time dimensions. */
188 | t_{time_depth-1} = t'_{time_depth-1}
189 | t_{time_depth+1} = t'_{time_depth+1}
191 | t_{dim_sctr} = t'_{dim_sctr}
193 This means that all the time dimensions are equal except for
194 time_depth, where the constraint is t_{depth} = t'_{depth} + 1
195 step. More to this: we should be careful not to add equalities
196 to the 'coupled' dimensions, which happens when the one dimension
197 is stripmined dimension, and the other dimension corresponds
198 to the point loop inside stripmined dimension. */
200 /* pdr->accesses: [P1..nb_param,I1..nb_domain]->[a,S1..nb_subscript]
201 ??? [P] not used for PDRs?
202 pdr->extent: [a,S1..nb_subscript]
203 pbb->domain: [P1..nb_param,I1..nb_domain]
204 pbb->transformed: [P1..nb_param,I1..nb_domain]->[T1..Tnb_sctr]
205 [T] includes local vars (currently unused)
207 First we create [P,I] -> [T,a,S]. */
209 map
= isl_map_flat_range_product (isl_map_copy (pbb
->transformed
),
210 isl_map_copy (pdr
->accesses
));
211 /* Add a dimension for L: [P,I] -> [T,a,S,L].*/
212 map
= isl_map_add_dims (map
, isl_dim_out
, 1);
213 /* Build a constraint for "lma[S] - L == 0", effectively calculating
214 L in terms of subscripts. */
215 lma
= build_linearized_memory_access (map
, pdr
);
216 /* And add it to the map, so we now have:
217 [P,I] -> [T,a,S,L] : lma([S]) == L. */
218 map
= isl_map_add_constraint (map
, lma
);
220 /* Then we create [P,I,P',I'] -> [T,a,S,L,T',a',S',L']. */
221 map
= isl_map_flat_product (map
, isl_map_copy (map
));
223 /* Now add the equality T[time_depth] == T'[time_depth]+1. This will
224 force L' to be the linear address at T[time_depth] + 1. */
225 time_depth
= psct_dynamic_dim (pbb
, depth
);
226 /* Length of [a,S] plus [L] ... */
227 offset
= 1 + isl_map_dim (pdr
->accesses
, isl_dim_out
);
229 offset
+= isl_map_dim (pbb
->transformed
, isl_dim_out
);
231 c
= isl_equality_alloc (isl_local_space_from_space (isl_map_get_space (map
)));
232 c
= isl_constraint_set_coefficient_si (c
, isl_dim_out
, time_depth
, 1);
233 c
= isl_constraint_set_coefficient_si (c
, isl_dim_out
,
234 offset
+ time_depth
, -1);
235 c
= isl_constraint_set_constant_si (c
, 1);
236 map
= isl_map_add_constraint (map
, c
);
238 /* Now we equate most of the T/T' elements (making PITaSL nearly
239 the same is (PITaSL)', except for one dimension, namely for 'depth'
240 (an index into [I]), after translating to index into [T]. Take care
241 to not produce an empty map, which indicates we wanted to equate
242 two dimensions that are already coupled via the above time_depth
243 dimension. Happens with strip mining where several scatter dimension
244 are interdependend. */
246 nt
= pbb_nb_scattering_transform (pbb
) + pbb_nb_local_vars (pbb
);
247 for (i
= 0; i
< nt
; i
++)
250 isl_map
*temp
= isl_map_equate (isl_map_copy (map
),
252 isl_dim_out
, offset
+ i
);
253 if (isl_map_is_empty (temp
))
262 /* Now maximize the expression L' - L. */
263 set
= isl_map_range (map
);
264 dc
= isl_set_get_space (set
);
265 aff
= isl_aff_zero_on_domain (isl_local_space_from_space (dc
));
266 aff
= isl_aff_set_coefficient_si (aff
, isl_dim_in
, offset
- 1, -1);
267 aff
= isl_aff_set_coefficient_si (aff
, isl_dim_in
, offset
+ offset
- 1, 1);
268 islstride
= isl_set_max_val (set
, aff
);
269 isl_val_get_num_gmp (islstride
, stride
);
270 isl_val_free (islstride
);
274 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
276 gmp_fprintf (dump_file
, "\nStride in BB_%d, DR_%d, depth %d: %Zd ",
277 pbb_index (pbb
), PDR_ID (pdr
), (int) depth
, stride
);
281 /* Sets STRIDES to the sum of all the strides of the data references
282 accessed in LOOP at DEPTH. */
285 memory_strides_in_loop_1 (lst_p loop
, graphite_dim_t depth
, mpz_t strides
)
295 FOR_EACH_VEC_ELT (LST_SEQ (loop
), j
, l
)
297 memory_strides_in_loop_1 (l
, depth
, strides
);
299 FOR_EACH_VEC_ELT (PBB_DRS (LST_PBB (l
)), i
, pdr
)
301 pdr_stride_in_loop (s
, depth
, pdr
);
302 mpz_set_si (n
, PDR_NB_REFS (pdr
));
304 mpz_add (strides
, strides
, s
);
311 /* Sets STRIDES to the sum of all the strides of the data references
312 accessed in LOOP at DEPTH. */
315 memory_strides_in_loop (lst_p loop
, graphite_dim_t depth
, mpz_t strides
)
317 if (mpz_cmp_si (loop
->memory_strides
, -1) == 0)
319 mpz_set_si (strides
, 0);
320 memory_strides_in_loop_1 (loop
, depth
, strides
);
323 mpz_set (strides
, loop
->memory_strides
);
326 /* Return true when the interchange of loops LOOP1 and LOOP2 is
339 | for (i = 0; i < N; i++)
340 | for (j = 0; j < N; j++)
346 The data access A[j][i] is described like this:
354 | 0 0 0 0 -1 0 100 >= 0
355 | 0 0 0 0 0 -1 100 >= 0
357 The linearized memory access L to A[100][100] is:
362 TODO: the shown format is not valid as it does not show the fact
363 that the iteration domain "i j" is transformed using the scattering.
365 Next, to measure the impact of iterating once in loop "i", we build
366 a maximization problem: first, we add to DR accesses the dimensions
367 k, s2, s3, L1 = 100 * s0 + s1, L2, and D1: this is the polyhedron P1.
368 L1 and L2 are the linearized memory access functions.
370 | i j N a s0 s1 k s2 s3 L1 L2 D1 1
371 | 0 0 0 1 0 0 0 0 0 0 0 0 -5 = 0 alias = 5
372 | 0 -1 0 0 1 0 0 0 0 0 0 0 0 = 0 s0 = j
373 |-2 0 0 0 0 1 0 0 0 0 0 0 0 = 0 s1 = 2 * i
374 | 0 0 0 0 1 0 0 0 0 0 0 0 0 >= 0
375 | 0 0 0 0 0 1 0 0 0 0 0 0 0 >= 0
376 | 0 0 0 0 -1 0 0 0 0 0 0 0 100 >= 0
377 | 0 0 0 0 0 -1 0 0 0 0 0 0 100 >= 0
378 | 0 0 0 0 100 1 0 0 0 -1 0 0 0 = 0 L1 = 100 * s0 + s1
380 Then, we generate the polyhedron P2 by interchanging the dimensions
381 (s0, s2), (s1, s3), (L1, L2), (k, i)
383 | i j N a s0 s1 k s2 s3 L1 L2 D1 1
384 | 0 0 0 1 0 0 0 0 0 0 0 0 -5 = 0 alias = 5
385 | 0 -1 0 0 0 0 0 1 0 0 0 0 0 = 0 s2 = j
386 | 0 0 0 0 0 0 -2 0 1 0 0 0 0 = 0 s3 = 2 * k
387 | 0 0 0 0 0 0 0 1 0 0 0 0 0 >= 0
388 | 0 0 0 0 0 0 0 0 1 0 0 0 0 >= 0
389 | 0 0 0 0 0 0 0 -1 0 0 0 0 100 >= 0
390 | 0 0 0 0 0 0 0 0 -1 0 0 0 100 >= 0
391 | 0 0 0 0 0 0 0 100 1 0 -1 0 0 = 0 L2 = 100 * s2 + s3
393 then we add to P2 the equality k = i + 1:
395 |-1 0 0 0 0 0 1 0 0 0 0 0 -1 = 0 k = i + 1
397 and finally we maximize the expression "D1 = max (P1 inter P2, L2 - L1)".
399 Similarly, to determine the impact of one iteration on loop "j", we
400 interchange (k, j), we add "k = j + 1", and we compute D2 the
401 maximal value of the difference.
403 Finally, the profitability test is D1 < D2: if in the outer loop
404 the strides are smaller than in the inner loop, then it is
405 profitable to interchange the loops at DEPTH1 and DEPTH2. */
408 lst_interchange_profitable_p (lst_p nest
, int depth1
, int depth2
)
413 gcc_assert (depth1
< depth2
);
418 memory_strides_in_loop (nest
, depth1
, d1
);
419 memory_strides_in_loop (nest
, depth2
, d2
);
421 res
= mpz_cmp (d1
, d2
) < 0;
429 /* Interchanges the loops at DEPTH1 and DEPTH2 of the original
430 scattering and assigns the resulting polyhedron to the transformed
434 pbb_interchange_loop_depths (graphite_dim_t depth1
, graphite_dim_t depth2
,
438 unsigned dim1
= psct_dynamic_dim (pbb
, depth1
);
439 unsigned dim2
= psct_dynamic_dim (pbb
, depth2
);
440 isl_space
*d
= isl_map_get_space (pbb
->transformed
);
441 isl_space
*d1
= isl_space_range (d
);
442 unsigned n
= isl_space_dim (d1
, isl_dim_out
);
443 isl_space
*d2
= isl_space_add_dims (d1
, isl_dim_in
, n
);
444 isl_map
*x
= isl_map_universe (d2
);
446 x
= isl_map_equate (x
, isl_dim_in
, dim1
, isl_dim_out
, dim2
);
447 x
= isl_map_equate (x
, isl_dim_in
, dim2
, isl_dim_out
, dim1
);
449 for (i
= 0; i
< n
; i
++)
450 if (i
!= dim1
&& i
!= dim2
)
451 x
= isl_map_equate (x
, isl_dim_in
, i
, isl_dim_out
, i
);
453 pbb
->transformed
= isl_map_apply_range (pbb
->transformed
, x
);
456 /* Apply the interchange of loops at depths DEPTH1 and DEPTH2 to all
457 the statements below LST. */
460 lst_apply_interchange (lst_p lst
, int depth1
, int depth2
)
465 if (LST_LOOP_P (lst
))
470 FOR_EACH_VEC_ELT (LST_SEQ (lst
), i
, l
)
471 lst_apply_interchange (l
, depth1
, depth2
);
474 pbb_interchange_loop_depths (depth1
, depth2
, LST_PBB (lst
));
477 /* Return true when the nest starting at LOOP1 and ending on LOOP2 is
478 perfect: i.e. there are no sequence of statements. */
481 lst_perfectly_nested_p (lst_p loop1
, lst_p loop2
)
486 if (!LST_LOOP_P (loop1
))
489 return LST_SEQ (loop1
).length () == 1
490 && lst_perfectly_nested_p (LST_SEQ (loop1
)[0], loop2
);
493 /* Transform the loop nest between LOOP1 and LOOP2 into a perfect
494 nest. To continue the naming tradition, this function is called
495 after perfect_nestify. NEST is set to the perfectly nested loop
496 that is created. BEFORE/AFTER are set to the loops distributed
497 before/after the loop NEST. */
500 lst_perfect_nestify (lst_p loop1
, lst_p loop2
, lst_p
*before
,
501 lst_p
*nest
, lst_p
*after
)
503 poly_bb_p first
, last
;
505 gcc_assert (loop1
&& loop2
507 && LST_LOOP_P (loop1
) && LST_LOOP_P (loop2
));
509 first
= LST_PBB (lst_find_first_pbb (loop2
));
510 last
= LST_PBB (lst_find_last_pbb (loop2
));
512 *before
= copy_lst (loop1
);
513 *nest
= copy_lst (loop1
);
514 *after
= copy_lst (loop1
);
516 lst_remove_all_before_including_pbb (*before
, first
, false);
517 lst_remove_all_before_including_pbb (*after
, last
, true);
519 lst_remove_all_before_excluding_pbb (*nest
, first
, true);
520 lst_remove_all_before_excluding_pbb (*nest
, last
, false);
522 if (lst_empty_p (*before
))
527 if (lst_empty_p (*after
))
532 if (lst_empty_p (*nest
))
539 /* Try to interchange LOOP1 with LOOP2 for all the statements of the
540 body of LOOP2. LOOP1 contains LOOP2. Return true if it did the
544 lst_try_interchange_loops (scop_p scop
, lst_p loop1
, lst_p loop2
)
546 int depth1
= lst_depth (loop1
);
547 int depth2
= lst_depth (loop2
);
550 lst_p before
= NULL
, nest
= NULL
, after
= NULL
;
552 if (!lst_perfectly_nested_p (loop1
, loop2
))
553 lst_perfect_nestify (loop1
, loop2
, &before
, &nest
, &after
);
555 if (!lst_interchange_profitable_p (loop2
, depth1
, depth2
))
558 lst_apply_interchange (loop2
, depth1
, depth2
);
560 /* Sync the transformed LST information and the PBB scatterings
561 before using the scatterings in the data dependence analysis. */
562 if (before
|| nest
|| after
)
564 transformed
= lst_substitute_3 (SCOP_TRANSFORMED_SCHEDULE (scop
), loop1
,
565 before
, nest
, after
);
566 lst_update_scattering (transformed
);
567 free_lst (transformed
);
570 if (graphite_legal_transform (scop
))
572 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
574 "Loops at depths %d and %d will be interchanged.\n",
577 /* Transform the SCOP_TRANSFORMED_SCHEDULE of the SCOP. */
578 lst_insert_in_sequence (before
, loop1
, true);
579 lst_insert_in_sequence (after
, loop1
, false);
583 lst_replace (loop1
, nest
);
590 /* Undo the transform. */
594 lst_apply_interchange (loop2
, depth2
, depth1
);
598 /* Selects the inner loop in LST_SEQ (INNER_FATHER) to be interchanged
599 with the loop OUTER in LST_SEQ (OUTER_FATHER). */
602 lst_interchange_select_inner (scop_p scop
, lst_p outer_father
, int outer
,
608 gcc_assert (outer_father
609 && LST_LOOP_P (outer_father
)
610 && LST_LOOP_P (LST_SEQ (outer_father
)[outer
])
612 && LST_LOOP_P (inner_father
));
614 loop1
= LST_SEQ (outer_father
)[outer
];
616 FOR_EACH_VEC_ELT (LST_SEQ (inner_father
), inner
, loop2
)
617 if (LST_LOOP_P (loop2
)
618 && (lst_try_interchange_loops (scop
, loop1
, loop2
)
619 || lst_interchange_select_inner (scop
, outer_father
, outer
, loop2
)))
625 /* Interchanges all the loops of LOOP and the loops of its body that
626 are considered profitable to interchange. Return the number of
627 interchanged loops. OUTER is the index in LST_SEQ (LOOP) that
628 points to the next outer loop to be considered for interchange. */
631 lst_interchange_select_outer (scop_p scop
, lst_p loop
, int outer
)
638 if (!loop
|| !LST_LOOP_P (loop
))
641 father
= LST_LOOP_FATHER (loop
);
644 while (lst_interchange_select_inner (scop
, father
, outer
, loop
))
647 loop
= LST_SEQ (father
)[outer
];
651 if (LST_LOOP_P (loop
))
652 FOR_EACH_VEC_ELT (LST_SEQ (loop
), i
, l
)
654 res
+= lst_interchange_select_outer (scop
, l
, i
);
659 /* Interchanges all the loop depths that are considered profitable for
660 SCOP. Return the number of interchanged loops. */
663 scop_do_interchange (scop_p scop
)
665 int res
= lst_interchange_select_outer
666 (scop
, SCOP_TRANSFORMED_SCHEDULE (scop
), 0);
668 lst_update_scattering (SCOP_TRANSFORMED_SCHEDULE (scop
));