gcc/gimple-loop-jam.c
/* Loop unroll-and-jam.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tree-pass.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "ssa.h"
#include "fold-const.h"
#include "tree-cfg.h"
#include "tree-ssa.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-ssa-loop-manip.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "gimple-iterator.h"
#include "cfghooks.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-vectorizer.h"
/* Unroll and Jam transformation

   This is a combination of two transformations, where the second
   is not always valid.  It's applicable if a loop nest has redundancies
   over the iterations of an outer loop while not having that with
   an inner loop.

   Given this nest:
       for (i) {
         for (j) {
           B(i,j)
         }
       }

   first unroll:
       for (i by 2) {
         for (j) {
           B(i,j)
         }
         for (j) {
           B(i+1,j)
         }
       }

   then fuse the two adjacent inner loops resulting from that:
       for (i by 2) {
         for (j) {
           B(i,j)
           B(i+1,j)
         }
       }

   As the order of evaluation of the body B changes, this is valid
   only in certain situations: all distance vectors need to be forward.
   Additionally, if there are induction variables other than just a
   counting control IV (j above), we can deal with some of those
   situations as well.

   The validity is checked by unroll_jam_possible_p and by the
   data-dependence testing below.

   A trivial example where the fusion is wrong would be when
   B(i,j) == x[j-1] = x[j]:
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
         }
         for (j) {
           x[j-1] = x[j];
         }
       }  effect: move content to front by two elements
       -->
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
           x[j-1] = x[j];
         }
       }  effect: move content to front by one element
*/
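
/* A concrete sketch (illustration only, not taken from this file; the
   array names and bounds are hypothetical) of a nest the pass both may
   and wants to transform:

       for (i = 1; i < n; i++)
         for (j = 0; j < m; j++)
           a[i][j] = a[i-1][j] + b[i][j];

   The only interesting dependence is the store a[i][j] against the load
   a[i-1][j], with distance vector (1,0); that is forward, so fusion is
   valid.  After unroll-by-2 and fusion:

       for (i = 1; i < n; i += 2)
         for (j = 0; j < m; j++)
           {
             a[i][j]   = a[i-1][j] + b[i][j];
             a[i+1][j] = a[i][j]   + b[i+1][j];
           }

   the load of a[i][j] in the second copy can reuse the value just stored
   by the first copy, so one memory stream goes away; this is exactly the
   (a,0)-distance situation the profitability code below looks for.  */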
/* Modify the loop tree for the fact that all code once belonging
   to the OLD loop or the outer loop of OLD now is inside LOOP.  */

static void
merge_loop_tree (class loop *loop, class loop *old)
{
  basic_block *bbs;
  int i, n;
  class loop *subloop;
  edge e;
  edge_iterator ei;

  /* Find its nodes.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    {
      /* If the block was a direct child of the OLD loop, it's now part
         of LOOP.  If it was outside OLD, then it moved into LOOP
         as well.  This avoids changing the loop father for BBs
         in inner loops of OLD.  */
      if (bbs[i]->loop_father == old
          || loop_depth (bbs[i]->loop_father) < loop_depth (old))
        {
          remove_bb_from_loops (bbs[i]);
          add_bb_to_loop (bbs[i], loop);
          continue;
        }

      /* If we find a direct subloop of OLD, move it to LOOP.  */
      subloop = bbs[i]->loop_father;
      if (loop_outer (subloop) == old && subloop->header == bbs[i])
        {
          flow_loop_tree_node_remove (subloop);
          flow_loop_tree_node_add (loop, subloop);
        }
    }

  /* Update the information about loop exit edges.  */
  for (i = 0; i < n; i++)
    {
      FOR_EACH_EDGE (e, ei, bbs[i]->succs)
        rescan_loop_exit (e, false, false);
    }

  loop->num_nodes = n;

  free (bbs);
}
/* BB is part of the outer loop of an unroll-and-jam situation.
   Check if any statements therein would prevent the transformation.  */

static bool
bb_prevents_fusion_p (basic_block bb)
{
  gimple_stmt_iterator gsi;
  /* BB is duplicated by outer unrolling and then the first N-1 copies
     move into the body of the fused inner loop.  If BB exits the outer
     loop, the last copy still does so, and the first N-1 copies are
     cancelled by loop unrolling, so also after fusion it's the exit block.
     But there might be other reasons that prevent fusion:
       * stores or unknown side-effects prevent fusion
       * loads don't
       * computations into SSA names: these aren't problematic.  Their
         result will be unused on the exit edges of the first N-1 copies
         (those aren't taken after unrolling).  If they are used on the
         other edge (the one leading to the outer latch block) they are
         loop-carried (on the outer loop) and the Nth copy of BB will
         compute them again (i.e. the first N-1 copies will be dead).  */
  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *g = gsi_stmt (gsi);
      if (gimple_vdef (g) || gimple_has_side_effects (g))
        return true;
    }
  return false;
}
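
/* For example (hypothetical source, illustration only), in

       for (i = 0; i < n; i++)
         {
           count++;               // store to a global, outside the inner loop
           for (j = 0; j < m; j++)
             sum[i] += x[i][j];
         }

   the outer-loop block containing "count++" carries a virtual definition,
   so bb_prevents_fusion_p () returns true for it and the nest is rejected;
   a pure load or an SSA-only computation in that block would be fine.  */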
/* Given an inner loop LOOP (of some OUTER loop) determine if
   we can safely fuse copies of it (generated by outer unrolling).
   If so return true, otherwise return false.  */

static bool
unroll_jam_possible_p (class loop *outer, class loop *loop)
{
  basic_block *bbs;
  int i, n;
  class tree_niter_desc niter;

  /* When fusing the loops we skip the latch block
     of the first one, so it mustn't have any effects to
     preserve.  */
  if (!empty_block_p (loop->latch))
    return false;

  if (!single_exit (loop))
    return false;

  /* We need a perfect nest.  Quick check for adjacent inner loops.  */
  if (outer->inner != loop || loop->next)
    return false;

  /* Prevent head-controlled inner loops, which we usually have.
     The guard block would need to be accepted
     (invariant condition either entering or skipping the loop),
     without also accepting arbitrary control flow.  When unswitching
     ran before us (as with -O3) this won't be a problem because its
     outer loop unswitching will have moved out the invariant condition.

     If we do that we need to extend fuse_loops() to cope with this
     by threading through the (still invariant) copied condition
     between the two loop copies.  */
  if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
    return false;

  /* The number of iterations of the inner loop must be loop invariant
     with respect to the outer loop.  */
  if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
                                  false, true)
      || niter.cmp == ERROR_MARK
      || !integer_zerop (niter.may_be_zero)
      || !expr_invariant_in_loop_p (outer, niter.niter))
    return false;

  /* If the inner loop produces any values that are used inside the
     outer loop (except the virtual op) then it can flow
     back (perhaps indirectly) into the inner loop.  This prevents
     fusion: without fusion the value at the last iteration is used,
     with fusion the value after the initial iteration is used.

     If all uses are outside the outer loop this doesn't prevent fusion;
     the value of the last iteration is still used (and the values from
     all intermediate iterations are dead).  */
  gphi_iterator psi;
  for (psi = gsi_start_phis (single_exit (loop)->dest);
       !gsi_end_p (psi); gsi_next (&psi))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      tree op = gimple_phi_result (psi.phi ());
      if (virtual_operand_p (op))
        continue;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
        {
          gimple *use_stmt = USE_STMT (use_p);
          if (!is_gimple_debug (use_stmt)
              && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
            return false;
        }
    }

  /* And check blocks belonging to just the outer loop.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
      break;
  free (bbs);
  if (i != n)
    return false;

  /* For now we can safely fuse copies of LOOP only if all
     loop carried variables are inductions (or the virtual op).

     We could handle reductions as well (the initial value in the second
     body would be the after-iter value of the first body) if they're over
     an associative and commutative operation.  We wouldn't
     be able to handle unknown cycles.  */
  for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
    {
      affine_iv iv;
      tree op = gimple_phi_result (psi.phi ());

      if (virtual_operand_p (op))
        continue;
      if (!simple_iv (loop, loop, op, &iv, true))
        return false;
      /* The inductions must be regular: loop-invariant step and initial
         value.  */
      if (!expr_invariant_in_loop_p (outer, iv.step)
          || !expr_invariant_in_loop_p (outer, iv.base))
        return false;
      /* XXX With more effort we could also be able to deal with inductions
         where the initial value is loop variant but a simple IV in the
         outer loop.  The initial value for the second body would be
         the original initial value plus iv.base.step.  The next value
         for the fused loop would be the original next value of the first
         copy, _not_ the next value of the second body.  */
    }

  return true;
}
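
/* For example (hypothetical source, illustration only), a nest like

       for (i = 0; i < n; i++)
         {
           int t = 0;
           for (j = 0; j < m; j++)
             t += x[i][j];
           y[i] = t;
         }

   is rejected here: t is a loop-carried non-induction value in the inner
   loop header (a reduction, which we don't handle yet), and its final
   value is used inside the outer loop (y[i] = t), so the exit-PHI check
   above fires as well; the store to y[i] in an outer-loop block would
   additionally trip bb_prevents_fusion_p ().  */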
/* Fuse LOOP with all further neighbors.  The loops are expected to
   be in appropriate form.  */

static void
fuse_loops (class loop *loop)
{
  class loop *next = loop->next;

  while (next)
    {
      edge e;

      remove_branch (single_pred_edge (loop->latch));
      /* Make delete_basic_block not fiddle with the loop structure.  */
      basic_block oldlatch = loop->latch;
      loop->latch = NULL;
      delete_basic_block (oldlatch);
      e = redirect_edge_and_branch (loop_latch_edge (next),
                                    loop->header);
      loop->latch = e->src;
      flush_pending_stmts (e);

      gcc_assert (EDGE_COUNT (next->header->preds) == 1);

      /* The PHI nodes of the second body (single-argument now)
         need adjustments to use the right values: either directly
         the value of the corresponding PHI in the first copy or
         the one leaving the first body which unrolling did for us.

         See also unroll_jam_possible_p() for further possibilities.  */
      gphi_iterator psi_first, psi_second;
      e = single_pred_edge (next->header);
      for (psi_first = gsi_start_phis (loop->header),
           psi_second = gsi_start_phis (next->header);
           !gsi_end_p (psi_first);
           gsi_next (&psi_first), gsi_next (&psi_second))
        {
          gphi *phi_first = psi_first.phi ();
          gphi *phi_second = psi_second.phi ();
          tree firstop = gimple_phi_result (phi_first);
          /* The virtual operand is correct already as it's
             always live at exit, hence has an LCSSA node and outer
             loop unrolling updated SSA form.  */
          if (virtual_operand_p (firstop))
            continue;

          /* Due to unroll_jam_possible_p() we know that this is
             an induction.  The second body goes over the same
             iteration space.  */
          add_phi_arg (phi_second, firstop, e,
                       gimple_location (phi_first));
        }
      gcc_assert (gsi_end_p (psi_second));

      merge_loop_tree (loop, next);
      gcc_assert (!next->num_nodes);
      class loop *ln = next->next;
      delete_loop (next);
      next = ln;
    }
  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
}
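
/* Schematically (illustration only), one fuse_loops () step takes the two
   adjacent copies left by outer unrolling,

       loop1: header1 -> body1 -> latch1 -> header1
       loop2: header2 -> body2 -> latch2 -> header2

   deletes latch1, redirects the latch edge of loop2 so that latch2 jumps
   back to header1, gives the PHIs in header2 their argument for the single
   remaining entry edge, and finally merge_loop_tree () makes all of
   loop2's blocks members of loop1.  The result is a single loop executing
   body1 then body2 on each iteration, with latch2 as its latch.  */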
/* Return true if any of the access functions for dataref A
   isn't invariant with respect to loop LOOP_NEST.  */
static bool
any_access_function_variant_p (const struct data_reference *a,
                               const class loop *loop_nest)
{
  vec<tree> fns = DR_ACCESS_FNS (a);

  for (tree t : fns)
    if (!evolution_function_is_invariant_p (t, loop_nest->num))
      return true;

  return false;
}
/* Return true if the distance in DDR can be determined, and adjust the
   unroll factor in *UNROLL to make unrolling valid for that distance.
   Otherwise return false.  DDR is with respect to the outer loop of INNER.

   If this data dep can lead to a removed memory reference, increment
   *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
   for this to happen.  */

static bool
adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
                      unsigned *unroll, unsigned *profit_unroll,
                      unsigned *removed)
{
  bool ret = false;
  if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
    {
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
        return false;
      unsigned i;
      lambda_vector dist_v;
      FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
        {
          /* A distance (a,b) is at worst transformed into (a/N,b) by the
             unrolling (factor N), so the transformation is valid if
             a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
             factor needs to be limited so that the first condition holds.
             That may limit the factor down to zero in the worst case.  */
          int dist = dist_v[0];
          if (dist < 0)
            gcc_unreachable ();
          else if ((unsigned)dist >= *unroll)
            ;
          else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            {
              /* We have (a,0) with a < N, so this will be transformed into
                 (0,0) after unrolling by N.  This might potentially be a
                 problem, if it's not a read-read dependency.  */
              if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
                ;
              else
                {
                  /* So, at least one is a write, and we might reduce the
                     distance vector to (0,0).  This is still no problem
                     if both data-refs are affine with respect to the inner
                     loops.  But if one of them is invariant with respect
                     to an inner loop our reordering implicit in loop fusion
                     corrupts the program, as our data dependences don't
                     capture this.  E.g. for:
                       for (0 <= i < n)
                         for (0 <= j < m)
                           a[i][0] = a[i+1][0] + 2;    // (1)
                           b[i][j] = b[i+1][j] + 2;    // (2)
                     the distance vector for both statements is (-1,0),
                     but exchanging the order for (2) is okay, while
                     for (1) it is not.  To see this, write out the original
                     accesses (assume m is 2):
                          a i j   original
                          0 0 0   r a[1][0] b[1][0]
                          1 0 0   w a[0][0] b[0][0]
                          2 0 1   r a[1][0] b[1][1]
                          3 0 1   w a[0][0] b[0][1]
                          4 1 0   r a[2][0] b[2][0]
                          5 1 0   w a[1][0] b[1][0]
                     after unroll-by-2 and fusion the accesses are done in
                     this order (from column a): 0,1, 4,5, 2,3, i.e. this:
                          a i j   transformed
                          0 0 0   r a[1][0] b[1][0]
                          1 0 0   w a[0][0] b[0][0]
                          4 1 0   r a[2][0] b[2][0]
                          5 1 0   w a[1][0] b[1][0]
                          2 0 1   r a[1][0] b[1][1]
                          3 0 1   w a[0][0] b[0][1]
                     Note how access 2 accesses the same element as access 5
                     for array 'a' but not for array 'b'.  */
                  if (any_access_function_variant_p (DDR_A (ddr), inner)
                      && any_access_function_variant_p (DDR_B (ddr), inner))
                    ;
                  else
                    /* And if any dataref of this pair is invariant with
                       respect to the inner loop, we have no choice but
                       to reduce the unroll factor.  */
                    *unroll = dist;
                }
            }
          else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            ;
          else
            *unroll = dist;

          /* With a distance (a,0) it's always profitable to unroll-and-jam
             (by a+1), because one memory reference will go away.  With
             (a,b) and b != 0 that's less clear.  We will increase the
             number of streams without lowering the number of mem refs.
             So for now only handle the first situation.  */
          if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            {
              *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
              (*removed)++;
            }

          ret = true;
        }
    }
  return ret;
}
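
/* A worked instance (illustration only, hypothetical body) of the rules
   above: for B(i,j) == a[i][j] = a[i-3][j] + 1 the only interesting
   distance vector is (3,0).  Both access functions vary in the inner loop,
   so the validity checks leave *UNROLL alone, while *PROFIT_UNROLL is
   raised to 4 and *REMOVED is incremented: after unroll-by-4 and fusion
   the copy for iteration i+3 loads a[i][j], which the copy for iteration i
   stored in the same fused body, so that memory reference can be forwarded
   and goes away.  */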
/* Main entry point for the unroll-and-jam transformation
   described above.  */

static unsigned int
tree_loop_unroll_and_jam (void)
{
  class loop *loop;
  bool changed = false;

  gcc_assert (scev_initialized_p ());

  /* Go through all innermost loops.  */
  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
    {
      class loop *outer = loop_outer (loop);

      if (loop_depth (loop) < 2
          || optimize_loop_nest_for_size_p (outer))
        continue;

      if (!unroll_jam_possible_p (outer, loop))
        continue;

      vec<data_reference_p> datarefs = vNULL;
      vec<ddr_p> dependences = vNULL;
      unsigned unroll_factor, profit_unroll, removed;
      class tree_niter_desc desc;
      bool unroll = false;

      auto_vec<loop_p, 3> loop_nest;
      if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
                                              &datarefs, &dependences))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file, "Cannot analyze data dependencies\n");
          free_data_refs (datarefs);
          free_dependence_relations (dependences);
          continue;
        }
      if (!datarefs.length ())
        continue;

      if (dump_file && (dump_flags & TDF_DETAILS))
        dump_data_dependence_relations (dump_file, dependences);

      unroll_factor = (unsigned)-1;
      profit_unroll = 1;
      removed = 0;

      /* Check all dependencies.  */
      unsigned i;
      struct data_dependence_relation *ddr;
      FOR_EACH_VEC_ELT (dependences, i, ddr)
        {
          struct data_reference *dra, *drb;

          /* If the refs are independent there's nothing to do.  */
          if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
            continue;
          dra = DDR_A (ddr);
          drb = DDR_B (ddr);
          /* Nothing interesting for the self dependencies.  */
          if (dra == drb)
            continue;

          /* Now check the distance vector, for determining a sensible
             outer unroll factor, and for validity of merging the inner
             loop copies.  */
          if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
                                     &removed))
            {
              /* Couldn't get the distance vector.  For two reads that's
                 harmless (we assume we should unroll).  For at least
                 one write this means we can't check the dependence direction
                 and hence can't determine safety.  */
              if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
                {
                  unroll_factor = 0;
                  break;
                }
            }
        }

      /* We regard a user-specified minimum percentage of zero as a request
         to ignore all profitability concerns and apply the transformation
         always.  */
      if (!param_unroll_jam_min_percent)
        profit_unroll = MAX (2, profit_unroll);
      else if (removed * 100 / datarefs.length ()
               < (unsigned)param_unroll_jam_min_percent)
        profit_unroll = 1;
      if (unroll_factor > profit_unroll)
        unroll_factor = profit_unroll;
      if (unroll_factor > (unsigned)param_unroll_jam_max_unroll)
        unroll_factor = param_unroll_jam_max_unroll;
      unroll = (unroll_factor > 1
                && can_unroll_loop_p (outer, unroll_factor, &desc));

      if (unroll)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
                             find_loop_location (outer),
                             "applying unroll and jam with factor %d\n",
                             unroll_factor);
          initialize_original_copy_tables ();
          tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
                            &desc);
          free_original_copy_tables ();
          fuse_loops (outer->inner);
          changed = true;
        }

      loop_nest.release ();
      free_dependence_relations (dependences);
      free_data_refs (datarefs);
    }

  if (changed)
    {
      scev_reset ();
      free_dominance_info (CDI_DOMINATORS);
      return TODO_cleanup_cfg;
    }
  return 0;
}
/* Pass boilerplate */

namespace {

const pass_data pass_data_loop_jam =
{
  GIMPLE_PASS, /* type */
  "unrolljam", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_JAM, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_jam : public gimple_opt_pass
{
public:
  pass_loop_jam (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_jam, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_unroll_jam != 0; }
  virtual unsigned int execute (function *);
};

unsigned int
pass_loop_jam::execute (function *fun)
{
  if (number_of_loops (fun) <= 1)
    return 0;

  return tree_loop_unroll_and_jam ();
}

}

gimple_opt_pass *
make_pass_loop_jam (gcc::context *ctxt)
{
  return new pass_loop_jam (ctxt);
}
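
/* Usage note (not part of the pass; check the exact option spellings
   against the GCC documentation): the pass is gated by
   -floop-unroll-and-jam, its decisions can be inspected with
   -fdump-tree-unrolljam-details, and the heuristics above are tunable via
   --param unroll-jam-min-percent and --param unroll-jam-max-unroll.  */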