PR target/44750
[official-gcc.git] / gcc / tree-loop-linear.c
bloba411c25c91cc4534608334f8ca37ff96c56ae602
1 /* Linear Loop transforms
2 Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Daniel Berlin <dberlin@dberlin.org>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "basic-block.h"
29 #include "obstack.h"
30 #include "tree-flow.h"
31 #include "tree-dump.h"
32 #include "timevar.h"
33 #include "cfgloop.h"
34 #include "tree-chrec.h"
35 #include "tree-data-ref.h"
36 #include "tree-scalar-evolution.h"
37 #include "tree-pass.h"
38 #include "lambda.h"
40 /* Linear loop transforms include any composition of interchange,
41 scaling, skewing, and reversal. They are used to change the
42 iteration order of loop nests in order to optimize data locality of
43 traversals, or remove dependences that prevent
44 parallelization/vectorization/etc.
46 TODO: Determine reuse vectors/matrix and use it to determine optimal
47 transform matrix for locality purposes.
48 TODO: Completion of partial transforms. */
50 /* Gather statistics for loop interchange. LOOP is the loop being
51 considered. The first loop in the considered loop nest is
52 FIRST_LOOP, and consequently, the index of the considered loop is
53 obtained by LOOP->DEPTH - FIRST_LOOP->DEPTH
55 Initializes:
56 - DEPENDENCE_STEPS the sum of all the data dependence distances
57 carried by loop LOOP,
59 - NB_DEPS_NOT_CARRIED_BY_LOOP the number of dependence relations
60 for which the loop LOOP is not carrying any dependence,
62 - ACCESS_STRIDES the sum of all the strides in LOOP.
64 Example: for the following loop,
66 | loop_1 runs 1335 times
67 | loop_2 runs 1335 times
68 | A[{{0, +, 1}_1, +, 1335}_2]
69 | B[{{0, +, 1}_1, +, 1335}_2]
70 | endloop_2
71 | A[{0, +, 1336}_1]
72 | endloop_1
74 gather_interchange_stats (in loop_1) will return
75 DEPENDENCE_STEPS = 3002
76 NB_DEPS_NOT_CARRIED_BY_LOOP = 5
77 ACCESS_STRIDES = 10694
79 gather_interchange_stats (in loop_2) will return
80 DEPENDENCE_STEPS = 3000
81 NB_DEPS_NOT_CARRIED_BY_LOOP = 7
82 ACCESS_STRIDES = 8010
85 static void
86 gather_interchange_stats (VEC (ddr_p, heap) *dependence_relations ATTRIBUTE_UNUSED,
87 VEC (data_reference_p, heap) *datarefs ATTRIBUTE_UNUSED,
88 struct loop *loop ATTRIBUTE_UNUSED,
89 struct loop *first_loop ATTRIBUTE_UNUSED,
90 unsigned int *dependence_steps ATTRIBUTE_UNUSED,
91 unsigned int *nb_deps_not_carried_by_loop ATTRIBUTE_UNUSED,
92 double_int *access_strides ATTRIBUTE_UNUSED)
94 unsigned int i, j;
95 struct data_dependence_relation *ddr;
96 struct data_reference *dr;
98 *dependence_steps = 0;
99 *nb_deps_not_carried_by_loop = 0;
100 *access_strides = double_int_zero;
102 FOR_EACH_VEC_ELT (ddr_p, dependence_relations, i, ddr)
104 /* If we don't know anything about this dependence, or the distance
105 vector is NULL, or there is no dependence, then there is no reuse of
106 data. */
107 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
108 || DDR_ARE_DEPENDENT (ddr) == chrec_known
109 || DDR_NUM_DIST_VECTS (ddr) == 0)
110 continue;
112 for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
114 int dist = DDR_DIST_VECT (ddr, j)[loop_depth (loop) - loop_depth (first_loop)];
116 if (dist == 0)
117 (*nb_deps_not_carried_by_loop) += 1;
119 else if (dist < 0)
120 (*dependence_steps) += -dist;
122 else
123 (*dependence_steps) += dist;
127 /* Compute the access strides. */
128 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
130 unsigned int it;
131 tree ref = DR_REF (dr);
132 gimple stmt = DR_STMT (dr);
133 struct loop *stmt_loop = loop_containing_stmt (stmt);
134 struct loop *inner_loop = first_loop->inner;
136 if (inner_loop != stmt_loop
137 && !flow_loop_nested_p (inner_loop, stmt_loop))
138 continue;
140 for (it = 0; it < DR_NUM_DIMENSIONS (dr);
141 it++, ref = TREE_OPERAND (ref, 0))
143 int num = am_vector_index_for_loop (DR_ACCESS_MATRIX (dr), loop->num);
144 int istride = AM_GET_ACCESS_MATRIX_ELEMENT (DR_ACCESS_MATRIX (dr), it, num);
145 tree array_size = TYPE_SIZE (TREE_TYPE (ref));
146 double_int dstride;
148 if (array_size == NULL_TREE
149 || TREE_CODE (array_size) != INTEGER_CST)
150 continue;
152 dstride = double_int_mul (tree_to_double_int (array_size),
153 shwi_to_double_int (istride));
154 (*access_strides) = double_int_add (*access_strides, dstride);
159 /* Attempt to apply interchange transformations to TRANS to maximize the
160 spatial and temporal locality of the loop.
161 Returns the new transform matrix. The smaller the reuse vector
162 distances in the inner loops, the fewer the cache misses.
163 FIRST_LOOP is the loop->num of the first loop in the analyzed loop
164 nest. */
167 static lambda_trans_matrix
168 try_interchange_loops (lambda_trans_matrix trans,
169 unsigned int depth,
170 VEC (ddr_p, heap) *dependence_relations,
171 VEC (data_reference_p, heap) *datarefs,
172 struct loop *first_loop)
174 bool res;
175 struct loop *loop_i;
176 struct loop *loop_j;
177 unsigned int dependence_steps_i, dependence_steps_j;
178 double_int access_strides_i, access_strides_j;
179 double_int small, large, nb_iter;
180 double_int l1_cache_size, l2_cache_size;
181 int cmp;
182 unsigned int nb_deps_not_carried_by_i, nb_deps_not_carried_by_j;
183 struct data_dependence_relation *ddr;
185 if (VEC_length (ddr_p, dependence_relations) == 0)
186 return trans;
188 /* When there is an unknown relation in the dependence_relations, we
189 know that it is no worth looking at this loop nest: give up. */
190 ddr = VEC_index (ddr_p, dependence_relations, 0);
191 if (ddr == NULL || DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
192 return trans;
194 l1_cache_size = uhwi_to_double_int (L1_CACHE_SIZE * 1024);
195 l2_cache_size = uhwi_to_double_int (L2_CACHE_SIZE * 1024);
197 /* LOOP_I is always the outer loop. */
198 for (loop_j = first_loop->inner;
199 loop_j;
200 loop_j = loop_j->inner)
201 for (loop_i = first_loop;
202 loop_depth (loop_i) < loop_depth (loop_j);
203 loop_i = loop_i->inner)
205 gather_interchange_stats (dependence_relations, datarefs,
206 loop_i, first_loop,
207 &dependence_steps_i,
208 &nb_deps_not_carried_by_i,
209 &access_strides_i);
210 gather_interchange_stats (dependence_relations, datarefs,
211 loop_j, first_loop,
212 &dependence_steps_j,
213 &nb_deps_not_carried_by_j,
214 &access_strides_j);
216 /* Heuristics for loop interchange profitability:
218 0. Don't transform if the smallest stride is larger than
219 the L2 cache, or if the largest stride multiplied by the
220 number of iterations is smaller than the L1 cache.
222 1. (spatial locality) Inner loops should have smallest
223 dependence steps.
225 2. (spatial locality) Inner loops should contain more
226 dependence relations not carried by the loop.
228 3. (temporal locality) Inner loops should have smallest
229 array access strides.
232 cmp = double_int_ucmp (access_strides_i, access_strides_j);
233 small = cmp < 0 ? access_strides_i : access_strides_j;
234 large = cmp < 0 ? access_strides_j : access_strides_i;
236 if (double_int_ucmp (small, l2_cache_size) > 0)
237 continue;
239 res = cmp < 0 ?
240 estimated_loop_iterations (loop_j, false, &nb_iter):
241 estimated_loop_iterations (loop_i, false, &nb_iter);
243 if (res
244 && double_int_ucmp (double_int_mul (large, nb_iter),
245 l1_cache_size) < 0)
246 continue;
248 if (dependence_steps_i < dependence_steps_j
249 || nb_deps_not_carried_by_i > nb_deps_not_carried_by_j
250 || cmp < 0)
252 lambda_matrix_row_exchange (LTM_MATRIX (trans),
253 loop_depth (loop_i) - loop_depth (first_loop),
254 loop_depth (loop_j) - loop_depth (first_loop));
255 /* Validate the resulting matrix. When the transformation
256 is not valid, reverse to the previous transformation. */
257 if (!lambda_transform_legal_p (trans, depth, dependence_relations))
258 lambda_matrix_row_exchange (LTM_MATRIX (trans),
259 loop_depth (loop_i) - loop_depth (first_loop),
260 loop_depth (loop_j) - loop_depth (first_loop));
264 return trans;
267 /* Return the number of nested loops in LOOP_NEST, or 0 if the loops
268 are not perfectly nested. */
270 unsigned int
271 perfect_loop_nest_depth (struct loop *loop_nest)
273 struct loop *temp;
274 unsigned int depth = 1;
276 /* If it's not a loop nest, we don't want it. We also don't handle
277 sibling loops properly, which are loops of the following form:
279 | for (i = 0; i < 50; i++)
281 | for (j = 0; j < 50; j++)
283 | ...
285 | for (j = 0; j < 50; j++)
287 | ...
292 if (!loop_nest->inner || !single_exit (loop_nest))
293 return 0;
295 for (temp = loop_nest->inner; temp; temp = temp->inner)
297 /* If we have a sibling loop or multiple exit edges, jump ship. */
298 if (temp->next || !single_exit (temp))
299 return 0;
301 depth++;
304 return depth;
307 /* Perform a set of linear transforms on loops. */
309 void
310 linear_transform_loops (void)
312 bool modified = false;
313 loop_iterator li;
314 VEC(tree,heap) *oldivs = NULL;
315 VEC(tree,heap) *invariants = NULL;
316 VEC(tree,heap) *lambda_parameters = NULL;
317 VEC(gimple,heap) *remove_ivs = VEC_alloc (gimple, heap, 3);
318 struct loop *loop_nest;
319 gimple oldiv_stmt;
320 unsigned i;
322 FOR_EACH_LOOP (li, loop_nest, 0)
324 unsigned int depth = 0;
325 VEC (ddr_p, heap) *dependence_relations;
326 VEC (data_reference_p, heap) *datarefs;
328 lambda_loopnest before, after;
329 lambda_trans_matrix trans;
330 struct obstack lambda_obstack;
331 struct loop *loop;
332 VEC(loop_p,heap) *nest;
334 depth = perfect_loop_nest_depth (loop_nest);
335 if (depth == 0)
336 continue;
338 nest = VEC_alloc (loop_p, heap, 3);
339 for (loop = loop_nest; loop; loop = loop->inner)
340 VEC_safe_push (loop_p, heap, nest, loop);
342 gcc_obstack_init (&lambda_obstack);
343 VEC_truncate (tree, oldivs, 0);
344 VEC_truncate (tree, invariants, 0);
345 VEC_truncate (tree, lambda_parameters, 0);
347 datarefs = VEC_alloc (data_reference_p, heap, 10);
348 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
349 if (!compute_data_dependences_for_loop (loop_nest, true, &datarefs,
350 &dependence_relations))
351 goto free_and_continue;
353 lambda_collect_parameters (datarefs, &lambda_parameters);
354 if (!lambda_compute_access_matrices (datarefs, lambda_parameters,
355 nest, &lambda_obstack))
356 goto free_and_continue;
358 if (dump_file && (dump_flags & TDF_DETAILS))
359 dump_ddrs (dump_file, dependence_relations);
361 /* Build the transformation matrix. */
362 trans = lambda_trans_matrix_new (depth, depth, &lambda_obstack);
363 lambda_matrix_id (LTM_MATRIX (trans), depth);
364 trans = try_interchange_loops (trans, depth, dependence_relations,
365 datarefs, loop_nest);
367 if (lambda_trans_matrix_id_p (trans))
369 if (dump_file)
370 fprintf (dump_file, "Won't transform loop. Optimal transform is the identity transform\n");
371 goto free_and_continue;
374 /* Check whether the transformation is legal. */
375 if (!lambda_transform_legal_p (trans, depth, dependence_relations))
377 if (dump_file)
378 fprintf (dump_file, "Can't transform loop, transform is illegal:\n");
379 goto free_and_continue;
382 before = gcc_loopnest_to_lambda_loopnest (loop_nest, &oldivs,
383 &invariants, &lambda_obstack);
385 if (!before)
386 goto free_and_continue;
388 if (dump_file)
390 fprintf (dump_file, "Before:\n");
391 print_lambda_loopnest (dump_file, before, 'i');
394 after = lambda_loopnest_transform (before, trans, &lambda_obstack);
396 if (dump_file)
398 fprintf (dump_file, "After:\n");
399 print_lambda_loopnest (dump_file, after, 'u');
402 lambda_loopnest_to_gcc_loopnest (loop_nest, oldivs, invariants,
403 &remove_ivs,
404 after, trans, &lambda_obstack);
405 modified = true;
407 if (dump_file)
408 fprintf (dump_file, "Successfully transformed loop.\n");
410 free_and_continue:
411 obstack_free (&lambda_obstack, NULL);
412 free_dependence_relations (dependence_relations);
413 free_data_refs (datarefs);
414 VEC_free (loop_p, heap, nest);
417 FOR_EACH_VEC_ELT (gimple, remove_ivs, i, oldiv_stmt)
418 remove_iv (oldiv_stmt);
420 VEC_free (tree, heap, oldivs);
421 VEC_free (tree, heap, invariants);
422 VEC_free (gimple, heap, remove_ivs);
423 scev_reset ();
425 if (modified)
426 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_full_phi);