/* Loop unroll-and-jam.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tree-pass.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "ssa.h"
#include "fold-const.h"
#include "tree-cfg.h"
#include "tree-ssa.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-ssa-loop-manip.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "gimple-iterator.h"
#include "cfghooks.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-vectorizer.h"
#include "tree-ssa-sccvn.h"
/* Unroll and Jam transformation

   This is a combination of two transformations, where the second
   is not always valid.  It's applicable if a loop nest has redundancies
   over the iterations of an outer loop while not having that with
   an inner loop.

   Given this nest:
       for (i) {
         for (j) {
           B(i,j)
         }
       }

   first unroll:
       for (i by 2) {
         for (j) {
           B(i,j)
         }
         for (j) {
           B(i+1,j)
         }
       }

   then fuse the two adjacent inner loops resulting from that:
       for (i by 2) {
         for (j) {
           B(i,j)
           B(i+1,j)
         }
       }

   As the order of evaluations of the body B changes this is valid
   only in certain situations: all distance vectors need to be forward.
   Additionally, if there are more induction variables than just a
   counting control IV (j above), we can also deal with some situations.

   The validity is checked by unroll_jam_possible_p, and the data-dep
   testing below.

   A trivial example where the fusion is wrong would be when
   B(i,j) == x[j-1] = x[j];
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
         }
         for (j) {
           x[j-1] = x[j];
         }
       }  effect: move content to front by two elements
       -->
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
           x[j-1] = x[j];
         }
       }  effect: move content to front by one element  */
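
/* An illustrative sketch (not taken from this file or any testcase) of a
   nest where the transformation pays off:

       for (i = 0; i < n; i++)
         for (j = 0; j < m; j++)
           c[i][j] = a[i][j] + a[i+1][j];

   The value a[i+1][j] read in outer iteration i is read again as a[i][j]
   in iteration i+1; after unroll-and-jam by 2 both uses sit in the same
   fused inner iteration, so the value-numbering walk done afterwards
   (see the do_rpo_vn call below) can remove one of the loads.  The pass
   is gated by flag_unroll_jam (-floop-unroll-and-jam) and bounded by the
   unroll-jam-min-percent and unroll-jam-max-unroll params used in
   tree_loop_unroll_and_jam.  */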
/* Modify the loop tree for the fact that all code once belonging
   to the OLD loop or the outer loop of OLD now is inside LOOP.  */

static void
merge_loop_tree (class loop *loop, class loop *old)
{
  basic_block *bbs;
  int i, n;
  class loop *subloop;
  edge e;
  edge_iterator ei;

  /* Find its nodes.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    {
      /* If the block was a direct child of OLD loop it's now part
         of LOOP.  If it was outside OLD, then it moved into LOOP
         as well.  This avoids changing the loop father for BBs
         in inner loops of OLD.  */
      if (bbs[i]->loop_father == old
          || loop_depth (bbs[i]->loop_father) < loop_depth (old))
        {
          remove_bb_from_loops (bbs[i]);
          add_bb_to_loop (bbs[i], loop);
          continue;
        }

      /* If we find a direct subloop of OLD, move it to LOOP.  */
      subloop = bbs[i]->loop_father;
      if (loop_outer (subloop) == old && subloop->header == bbs[i])
        {
          flow_loop_tree_node_remove (subloop);
          flow_loop_tree_node_add (loop, subloop);
        }
    }

  /* Update the information about loop exit edges.  */
  for (i = 0; i < n; i++)
    {
      FOR_EACH_EDGE (e, ei, bbs[i]->succs)
        {
          rescan_loop_exit (e, false, false);
        }
    }

  loop->num_nodes = n;

  free (bbs);
}
/* BB is part of the outer loop of an unroll-and-jam situation.
   Check if any statements therein would prevent the transformation.  */

static bool
bb_prevents_fusion_p (basic_block bb)
{
  gimple_stmt_iterator gsi;
  /* BB is duplicated by outer unrolling and then all N-1 first copies
     move into the body of the fused inner loop.  If BB exits the outer loop
     the last copy still does so, and the first N-1 copies are cancelled
     by loop unrolling, so also after fusion it's the exit block.
     But there might be other reasons that prevent fusion:
       * stores or unknown side-effects prevent fusion
       * loads don't
       * computations into SSA names: these aren't problematic.  Their
         result will be unused on the exit edges of the first N-1 copies
         (those aren't taken after unrolling).  If they are used on the
         other edge (the one leading to the outer latch block) they are
         loop-carried (on the outer loop) and the Nth copy of BB will
         compute them again (i.e. the first N-1 copies will be dead).  */
  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *g = gsi_stmt (gsi);
      if (gimple_vdef (g) || gimple_has_side_effects (g))
        return true;
    }
  return false;
}
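
/* An illustrative example (not from a testcase) of a block this rejects:

       for (i) {
         for (j)
           B(i,j)
         count[i] = foo ();   // store plus call in the outer-loop body
       }

   The block holding the count[i] assignment belongs only to the outer
   loop.  Outer unrolling duplicates it and fusion would move the first
   N-1 copies into the fused inner body, changing how often the store and
   the call execute relative to B, so the vdef/side-effect check above
   makes the transformation bail out.  */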
/* Given an inner loop LOOP (of some OUTER loop) determine if
   we can safely fuse copies of it (generated by outer unrolling).
   If so return true, otherwise return false.  */

static bool
unroll_jam_possible_p (class loop *outer, class loop *loop)
{
  basic_block *bbs;
  int i, n;
  class tree_niter_desc niter;

  /* When fusing the loops we skip the latch block
     of the first one, so it mustn't have any effects to
     preserve.  */
  if (!empty_block_p (loop->latch))
    return false;

  edge exit;
  if (!(exit = single_exit (loop)))
    return false;

  /* We need a perfect nest.  Quick check for adjacent inner loops.  */
  if (outer->inner != loop || loop->next)
    return false;

  /* Prevent head-controlled inner loops, that we usually have.
     The guard block would need to be accepted
     (invariant condition either entering or skipping the loop),
     without also accepting arbitrary control flow.  When unswitching
     ran before us (as with -O3) this won't be a problem because its
     outer loop unswitching will have moved out the invariant condition.

     If we do that we need to extend fuse_loops() to cope with this
     by threading through the (still invariant) copied condition
     between the two loop copies.  */
  if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
    return false;

  /* The number of iterations of the inner loop must be loop invariant
     with respect to the outer loop.  */
  if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
                                  false, true)
      || niter.cmp == ERROR_MARK
      || !integer_zerop (niter.may_be_zero)
      || !expr_invariant_in_loop_p (outer, niter.niter))
    return false;

  /* If the inner loop produces any values that are used inside the
     outer loop (except the virtual op) then it can flow
     back (perhaps indirectly) into the inner loop.  This prevents
     fusion: without fusion the value at the last iteration is used,
     with fusion the value after the initial iteration is used.

     If all uses are outside the outer loop this doesn't prevent fusion;
     the value of the last iteration is still used (and the values from
     all intermediate iterations are dead).  */
  gphi_iterator psi;
  for (psi = gsi_start_phis (single_exit (loop)->dest);
       !gsi_end_p (psi); gsi_next (&psi))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      tree op = gimple_phi_result (psi.phi ());
      if (virtual_operand_p (op))
        continue;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
        {
          gimple *use_stmt = USE_STMT (use_p);
          if (!is_gimple_debug (use_stmt)
              && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
            return false;
        }
    }

  /* And check blocks belonging to just the outer loop.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    if (bbs[i]->loop_father == outer
        && (bb_prevents_fusion_p (bbs[i])
            /* Outer loop exits must come after the inner loop, otherwise
               we'll put the outer loop exit into the fused inner loop.  */
            || (loop_exits_from_bb_p (outer, bbs[i])
                && !dominated_by_p (CDI_DOMINATORS, bbs[i], exit->src))))
      break;
  free (bbs);
  if (i != n)
    return false;

  /* For now we can safely fuse copies of LOOP only if all
     loop carried variables are inductions (or the virtual op).

     We could handle reductions as well (the initial value in the second
     body would be the after-iter value of the first body) if it's over
     an associative and commutative operation.  We wouldn't
     be able to handle unknown cycles.  */
  for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
    {
      affine_iv iv;
      tree op = gimple_phi_result (psi.phi ());

      if (virtual_operand_p (op))
        continue;
      if (!simple_iv (loop, loop, op, &iv, true))
        return false;
      /* The inductions must be regular, with loop invariant step and
         initial value.  */
      if (!expr_invariant_in_loop_p (outer, iv.step)
          || !expr_invariant_in_loop_p (outer, iv.base))
        return false;
      /* XXX With more effort we could also be able to deal with inductions
         where the initial value is loop variant but a simple IV in the
         outer loop.  The initial value for the second body would be
         the original initial value plus iv.base.step.  The next value
         for the fused loop would be the original next value of the first
         copy, _not_ the next value of the second body.  */
    }

  return true;
}
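
/* A concrete case the checks above currently reject (an illustrative
   sketch, not from a testcase): an inner-loop reduction such as

       for (i)
         for (j)
           s += a[i][j];

   carries s through a header PHI of the inner loop that is not a simple
   induction, so simple_iv () fails and unroll_jam_possible_p returns
   false, even though the comment above sketches how such reductions
   could in principle be supported.  */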
/* Fuse LOOP with all further neighbors.  The loops are expected to
   be in appropriate form.  */

static void
fuse_loops (class loop *loop)
{
  class loop *next = loop->next;

  while (next)
    {
      edge e;

      remove_branch (single_pred_edge (loop->latch));
      /* Make delete_basic_block not fiddle with the loop structure.  */
      basic_block oldlatch = loop->latch;
      loop->latch = NULL;
      delete_basic_block (oldlatch);
      e = redirect_edge_and_branch (loop_latch_edge (next),
                                    loop->header);
      loop->latch = e->src;
      flush_pending_stmts (e);

      gcc_assert (EDGE_COUNT (next->header->preds) == 1);

      /* The PHI nodes of the second body (single-argument now)
         need adjustments to use the right values: either directly
         the value of the corresponding PHI in the first copy or
         the one leaving the first body which unrolling did for us.

         See also unroll_jam_possible_p() for further possibilities.  */
      gphi_iterator psi_first, psi_second;
      e = single_pred_edge (next->header);
      for (psi_first = gsi_start_phis (loop->header),
           psi_second = gsi_start_phis (next->header);
           !gsi_end_p (psi_first);
           gsi_next (&psi_first), gsi_next (&psi_second))
        {
          gphi *phi_first = psi_first.phi ();
          gphi *phi_second = psi_second.phi ();
          tree firstop = gimple_phi_result (phi_first);
          /* The virtual operand is correct already as it's
             always live at exit, hence has a LCSSA node and outer
             loop unrolling updated SSA form.  */
          if (virtual_operand_p (firstop))
            continue;

          /* Due to unroll_jam_possible_p() we know that this is
             an induction.  The second body goes over the same
             iteration space.  */
          add_phi_arg (phi_second, firstop, e,
                       gimple_location (phi_first));
        }
      gcc_assert (gsi_end_p (psi_second));

      merge_loop_tree (loop, next);
      gcc_assert (!next->num_nodes);
      class loop *ln = next->next;
      delete_loop (next);
      next = ln;
    }
  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
}
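
/* Rough sketch of the rewiring above for two copies (an illustration, not
   lifted from a pass dump): the first copy's latch block is removed, the
   second copy's latch edge is redirected to the first copy's header
   (making that block the latch of the fused loop), and the second header
   is left with a single predecessor, so control falls straight from the
   first body into the second one within every fused iteration.  */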
/* Return true if any of the access functions for dataref A
   isn't invariant with respect to loop LOOP_NEST.  */

static bool
any_access_function_variant_p (const struct data_reference *a,
                               const class loop *loop_nest)
{
  vec<tree> fns = DR_ACCESS_FNS (a);

  for (tree t : fns)
    if (!evolution_function_is_invariant_p (t, loop_nest->num))
      return true;

  return false;
}
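
/* For intuition (an illustrative aside): with an inner loop over j, a
   reference b[i][j] has an access function that evolves with j, so this
   predicate returns true for it, whereas a[i][0] only uses values that
   are invariant in the j loop and the predicate returns false.
   adjust_unroll_factor below relies on exactly this distinction.  */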
/* Returns true if the distance in DDR can be determined and adjusts
   the unroll factor in *UNROLL to make unrolling valid for that distance.
   Otherwise return false.  DDR is with respect to the outer loop of INNER.

   If this data dep can lead to a removed memory reference, increment
   *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
   for this to happen.  */

static bool
adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
                      unsigned *unroll, unsigned *profit_unroll,
                      unsigned *removed)
{
  bool ret = false;
  if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
    {
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
        return false;
      unsigned i;
      lambda_vector dist_v;
      FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
        {
          /* A distance (a,b) is at worst transformed into (a/N,b) by the
             unrolling (factor N), so the transformation is valid if
             a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
             factor needs to be limited so that the first condition holds.
             That may limit the factor down to zero in the worst case.  */
          lambda_int dist = dist_v[0];
          if (dist < 0)
            gcc_unreachable ();
          else if (dist >= (lambda_int)*unroll)
            ;
          else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            {
              /* We have (a,0) with a < N, so this will be transformed into
                 (0,0) after unrolling by N.  This might potentially be a
                 problem, if it's not a read-read dependency.  */
              if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
                ;
              else
                {
                  /* So, at least one is a write, and we might reduce the
                     distance vector to (0,0).  This is still no problem
                     if both data-refs are affine with respect to the inner
                     loops.  But if one of them is invariant with respect
                     to an inner loop our reordering implicit in loop fusion
                     corrupts the program, as our data dependences don't
                     capture this.  E.g. for:
                       for (0 <= i < n)
                         for (0 <= j < m)
                           a[i][0] = a[i+1][0] + 2;    // (1)
                           b[i][j] = b[i+1][j] + 2;    // (2)
                     the distance vector for both statements is (-1,0),
                     but exchanging the order for (2) is okay, while
                     for (1) it is not.  To see this, write out the original
                     accesses (assume m is 2):
                       a i j original
                       0 0 0 r a[1][0] b[1][0]
                       1 0 0 w a[0][0] b[0][0]
                       2 0 1 r a[1][0] b[1][1]
                       3 0 1 w a[0][0] b[0][1]
                       4 1 0 r a[2][0] b[2][0]
                       5 1 0 w a[1][0] b[1][0]
                     after unroll-by-2 and fusion the accesses are done in
                     this order (from column a): 0,1, 4,5, 2,3, i.e. this:
                       a i j transformed
                       0 0 0 r a[1][0] b[1][0]
                       1 0 0 w a[0][0] b[0][0]
                       4 1 0 r a[2][0] b[2][0]
                       5 1 0 w a[1][0] b[1][0]
                       2 0 1 r a[1][0] b[1][1]
                       3 0 1 w a[0][0] b[0][1]
                     Note how access 2 accesses the same element as access 5
                     for array 'a' but not for array 'b'.  */
                  if (any_access_function_variant_p (DDR_A (ddr), inner)
                      && any_access_function_variant_p (DDR_B (ddr), inner))
                    ;
                  else
                    /* And if any dataref of this pair is invariant with
                       respect to the inner loop, we have no choice but
                       to reduce the unroll factor.  */
                    *unroll = dist;
                }
            }
          else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            ;
          else
            *unroll = dist;

          /* With a distance (a,0) it's always profitable to unroll-and-jam
             (by a+1), because one memory reference will go away.  With
             (a,b) and b != 0 that's less clear.  We will increase the
             number of streams without lowering the number of mem refs.
             So for now only handle the first situation.  */
          if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            {
              *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
              (*removed)++;
            }

          ret = true;
        }
    }
  return ret;
}
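
/* A worked instance of the adjustment above (an illustration, not from a
   dump): assume the only dependence has distance vector (3,0), one of its
   refs is invariant in the inner loop, and *UNROLL starts out huge.  Then
   dist = 3 is smaller than *UNROLL and the inner components are all zero,
   so *UNROLL is clamped to 3; because the inner components are zero the
   dependence also promises a removable memory reference, so *PROFIT_UNROLL
   is raised to at least 4 (= dist + 1) and *REMOVED is incremented.  The
   caller below then takes the minimum of the validity and profitability
   factors.  */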
/* Main entry point for the unroll-and-jam transformation
   described above.  */

static unsigned int
tree_loop_unroll_and_jam (void)
{
  unsigned int todo = 0;

  gcc_assert (scev_initialized_p ());

  /* Go through all innermost loops.  */
  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
    {
      class loop *outer = loop_outer (loop);

      if (loop_depth (loop) < 2
          || optimize_loop_nest_for_size_p (outer))
        continue;

      if (!unroll_jam_possible_p (outer, loop))
        continue;

      vec<data_reference_p> datarefs = vNULL;
      vec<ddr_p> dependences = vNULL;
      unsigned unroll_factor, profit_unroll, removed;
      class tree_niter_desc desc;
      bool unroll = false;

      auto_vec<loop_p, 3> loop_nest;
      if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
                                              &datarefs, &dependences))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file, "Cannot analyze data dependencies\n");
          free_data_refs (datarefs);
          free_dependence_relations (dependences);
          continue;
        }

      if (!datarefs.length ())
        continue;

      if (dump_file && (dump_flags & TDF_DETAILS))
        dump_data_dependence_relations (dump_file, dependences);

      unroll_factor = (unsigned)-1;
      profit_unroll = 1;
      removed = 0;

      /* Check all dependencies.  */
      unsigned i;
      struct data_dependence_relation *ddr;
      FOR_EACH_VEC_ELT (dependences, i, ddr)
        {
          struct data_reference *dra, *drb;

          /* If the refs are independent there's nothing to do.  */
          if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
            continue;
          dra = DDR_A (ddr);
          drb = DDR_B (ddr);
          /* Nothing interesting for the self dependencies.  */
          if (dra == drb)
            continue;

          /* Now check the distance vector, for determining a sensible
             outer unroll factor, and for validity of merging the inner
             loop copies.  */
          if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
                                     &removed))
            {
              /* Couldn't get the distance vector.  For two reads that's
                 harmless (we assume we should unroll).  For at least
                 one write this means we can't check the dependence direction
                 and hence can't determine safety.  */

              if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
                {
                  unroll_factor = 0;
                  break;
                }
            }
        }

      /* We regard a user-specified minimum percentage of zero as a request
         to ignore all profitability concerns and apply the transformation
         always.  */
      if (!param_unroll_jam_min_percent)
        profit_unroll = MAX (2, profit_unroll);
      else if (removed * 100 / datarefs.length ()
               < (unsigned)param_unroll_jam_min_percent)
        profit_unroll = 1;
      if (unroll_factor > profit_unroll)
        unroll_factor = profit_unroll;
      if (unroll_factor > (unsigned)param_unroll_jam_max_unroll)
        unroll_factor = param_unroll_jam_max_unroll;
      unroll = (unroll_factor > 1
                && can_unroll_loop_p (outer, unroll_factor, &desc));
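
      /* For instance (an illustrative aside, not from a dump): with 8
         analyzed datarefs and 2 of them predicted removable,
         removed * 100 / datarefs.length () is 25; if unroll-jam-min-percent
         is larger than that, PROFIT_UNROLL is reset to 1 and nothing is
         unrolled, otherwise the factor is the minimum of the validity
         limit, PROFIT_UNROLL and unroll-jam-max-unroll.  */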
      if (unroll)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
                             find_loop_location (outer),
                             "applying unroll and jam with factor %d\n",
                             unroll_factor);
          initialize_original_copy_tables ();
          tree_unroll_loop (outer, unroll_factor, &desc);
          free_original_copy_tables ();
          fuse_loops (outer->inner);
          todo |= TODO_cleanup_cfg;

          auto_bitmap exit_bbs;
          bitmap_set_bit (exit_bbs, single_exit (outer)->dest->index);
          todo |= do_rpo_vn (cfun, loop_preheader_edge (outer), exit_bbs);
        }

      loop_nest.release ();
      free_dependence_relations (dependences);
      free_data_refs (datarefs);
    }

  if (todo)
    {
      scev_reset ();
      free_dominance_info (CDI_DOMINATORS);
    }

  return todo;
}
/* Pass boilerplate */

namespace {

const pass_data pass_data_loop_jam =
{
  GIMPLE_PASS, /* type */
  "unrolljam", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_JAM, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_jam : public gimple_opt_pass
{
public:
  pass_loop_jam (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_jam, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_unroll_jam != 0; }
  virtual unsigned int execute (function *);
};

} // anon namespace

unsigned int
pass_loop_jam::execute (function *fun)
{
  if (number_of_loops (fun) <= 1)
    return 0;

  return tree_loop_unroll_and_jam ();
}

gimple_opt_pass *
make_pass_loop_jam (gcc::context *ctxt)
{
  return new pass_loop_jam (ctxt);
}