/* Loop unroll-and-jam.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "tree-pass.h"
#include "fold-const.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-ssa-loop-manip.h"
#include "tree-scalar-evolution.h"
#include "gimple-iterator.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-vectorizer.h"
#include "tree-ssa-sccvn.h"
/* Unroll and Jam transformation

   This is a combination of two transformations, where the second
   is not always valid.  It's applicable if a loop nest has redundancies
   over the iterations of an outer loop while not having that with
   an inner loop.

   Given a loop nest:
       for (i) {
         for (j) {
           B(i,j)
         }
       }

   first unroll the outer loop (here by 2):
       for (i by 2) {
         for (j) {
           B(i,j)
         }
         for (j) {
           B(i+1,j)
         }
       }

   then fuse the two adjacent inner loops resulting from that:
       for (i by 2) {
         for (j) {
           B(i,j)
           B(i+1,j)
         }
       }

   As the order of evaluations of the body B changes this is valid
   only in certain situations: all distance vectors need to be forward.
   Additionally, if there are induction variables other than just
   a counting control IV (j above) we can also deal with some situations.

   The validity is checked by unroll_jam_possible_p, and the data-dep
   validity of the unroll factor by adjust_unroll_factor.

   A trivial example where the fusion is wrong would be when
   B(i,j) == x[j-1] = x[j];
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
         }
         for (j) {
           x[j-1] = x[j];
         }
       }  effect: move content to front by two elements
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
           x[j-1] = x[j];
         }
       }  effect: move content to front by one element  */
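/* For illustration, a sketch of a nest where the transformation pays off:
   with B(i,j) == a[j] += b[i][j] the fused body becomes
       a[j] += b[i][j];
       a[j] += b[i+1][j];
   and the value numbering run after the transformation can remove one load
   and one store of a[j] per fused iteration.  The self-dependence of a[j]
   has distance vector (1,0), the case adjust_unroll_factor below counts as
   profitable.  */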
/* Modify the loop tree for the fact that all code once belonging
   to the OLD loop or the outer loop of OLD now is inside LOOP.  */

static void
merge_loop_tree (class loop *loop, class loop *old)
{
  basic_block *bbs;
  unsigned i, n;
  class loop *subloop;
  edge e;
  edge_iterator ei;

  /* Find its nodes.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    {
      /* If the block was direct child of OLD loop it's now part
         of LOOP.  If it was outside OLD, then it moved into LOOP
         as well.  This avoids changing the loop father for BBs
         in inner loops of OLD.  */
      if (bbs[i]->loop_father == old
          || loop_depth (bbs[i]->loop_father) < loop_depth (old))
        {
          remove_bb_from_loops (bbs[i]);
          add_bb_to_loop (bbs[i], loop);
          continue;
        }

      /* If we find a direct subloop of OLD, move it to LOOP.  */
      subloop = bbs[i]->loop_father;
      if (loop_outer (subloop) == old && subloop->header == bbs[i])
        {
          flow_loop_tree_node_remove (subloop);
          flow_loop_tree_node_add (loop, subloop);
        }
    }

  /* Update the information about loop exit edges.  */
  for (i = 0; i < n; i++)
    FOR_EACH_EDGE (e, ei, bbs[i]->succs)
      rescan_loop_exit (e, false, false);

  free (bbs);
}
/* BB is part of the outer loop of an unroll-and-jam situation.
   Check if any statements therein would prevent the transformation.  */

static bool
bb_prevents_fusion_p (basic_block bb)
{
  gimple_stmt_iterator gsi;
  /* BB is duplicated by outer unrolling and then all N-1 first copies
     move into the body of the fused inner loop.  If BB exits the outer loop
     the last copy still does so, and the first N-1 copies are cancelled
     by loop unrolling, so also after fusion it's the exit block.
     But there might be other reasons that prevent fusion:
       * stores or unknown side-effects prevent fusion
       * loads don't
       * computations into SSA names: these aren't problematic.  Their
         result will be unused on the exit edges of the first N-1 copies
         (those aren't taken after unrolling).  If they are used on the
         other edge (the one leading to the outer latch block) they are
         loop-carried (on the outer loop) and the Nth copy of BB will
         compute them again (i.e. the first N-1 copies will be dead).  */
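  /* For illustration, a sketch: in

         for (i) {
           for (j)
             B(i,j);
           a[i] = s;   <- store in the outer loop's body block
         }

     the first N-1 copies of the block doing "a[i] = s" would move into the
     fused inner loop and execute once per inner iteration instead of once
     per outer iteration, so any vdef or other side-effect below makes us
     give up.  */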
  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *g = gsi_stmt (gsi);
      if (gimple_vdef (g) || gimple_has_side_effects (g))
        return true;
    }
  return false;
}
/* Given an inner loop LOOP (of some OUTER loop) determine if
   we can safely fuse copies of it (generated by outer unrolling).
   If so return true, otherwise return false.  */

static bool
unroll_jam_possible_p (class loop *outer, class loop *loop)
{
  basic_block *bbs;
  unsigned i, n;
  class tree_niter_desc niter;

  /* When fusing the loops we skip the latch block
     of the first one, so it mustn't have any effects to
     the second loop.  */
  if (!empty_block_p (loop->latch))
    return false;

  if (!single_exit (loop))
    return false;

  /* We need a perfect nest.  Quick check for adjacent inner loops.  */
  if (outer->inner != loop || loop->next)
    return false;

  /* Prevent head-controlled inner loops, which we usually have.
     The guard block would need to be accepted
     (invariant condition either entering or skipping the loop),
     without also accepting arbitrary control flow.  When unswitching
     ran before us (as with -O3) this won't be a problem because its
     outer loop unswitching will have moved out the invariant condition.

     If we do that we need to extend fuse_loops() to cope with this
     by threading through the (still invariant) copied condition
     between the two loop copies.  */
  if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
    return false;

  /* The number of iterations of the inner loop must be loop invariant
     with respect to the outer loop.  */
  if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
                                  false, true)
      || niter.cmp == ERROR_MARK
      || !integer_zerop (niter.may_be_zero)
      || !expr_invariant_in_loop_p (outer, niter.niter))
    return false;

  /* If the inner loop produces any values that are used inside the
     outer loop (except the virtual op) then it can flow
     back (perhaps indirectly) into the inner loop.  This prevents
     fusion: without fusion the value at the last iteration is used,
     with fusion the value after the initial iteration is used.

     If all uses are outside the outer loop this doesn't prevent fusion;
     the value of the last iteration is still used (and the values from
     all intermediate iterations are dead).  */
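  /* For illustration, a sketch: in

         for (i) {
           for (j)
             s += a[i][j];
           sum += s;   <- value produced by the inner loop used here
         }

     the LCSSA PHI for 's' at the inner loop's exit has a use inside the
     outer loop; after fusion that use would no longer see the value of 's'
     after a complete inner loop, so such cases are rejected below.  */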
  gphi_iterator psi;
  for (psi = gsi_start_phis (single_exit (loop)->dest);
       !gsi_end_p (psi); gsi_next (&psi))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      tree op = gimple_phi_result (psi.phi ());
      if (virtual_operand_p (op))
        continue;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
        {
          gimple *use_stmt = USE_STMT (use_p);
          if (!is_gimple_debug (use_stmt)
              && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
            return false;
        }
    }

  /* And check blocks belonging to just outer loop.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
      break;
  free (bbs);
  if (i != n)
    return false;

  /* For now we can safely fuse copies of LOOP only if all
     loop carried variables are inductions (or the virtual op).

     We could handle reductions as well (the initial value in the second
     body would be the after-iter value of the first body) if it's over
     an associative and commutative operation.  We wouldn't
     be able to handle unknown cycles.  */
  for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
    {
      affine_iv iv;
      tree op = gimple_phi_result (psi.phi ());

      if (virtual_operand_p (op))
        continue;
      if (!simple_iv (loop, loop, op, &iv, true))
        return false;
      /* The inductions must be regular, loop invariant step and initial
         value.  */
      if (!expr_invariant_in_loop_p (outer, iv.step)
          || !expr_invariant_in_loop_p (outer, iv.base))
        return false;
      /* XXX With more effort we could also be able to deal with inductions
         where the initial value is loop variant but a simple IV in the
         outer loop.  The initial value for the second body would be
         the original initial value plus iv.base.step.  The next value
         for the fused loop would be the original next value of the first
         copy, _not_ the next value of the second body.  */
    }

  return true;
}
/* Fuse LOOP with all further neighbors.  The loops are expected to
   be in appropriate form.  */

static void
fuse_loops (class loop *loop)
{
  class loop *next = loop->next;

  while (next)
    {
      edge e;

      remove_branch (single_pred_edge (loop->latch));
      /* Make delete_basic_block not fiddle with the loop structure.  */
      basic_block oldlatch = loop->latch;
      loop->latch = NULL;
      delete_basic_block (oldlatch);
      e = redirect_edge_and_branch (loop_latch_edge (next),
                                    loop->header);
      loop->latch = e->src;
      flush_pending_stmts (e);

      gcc_assert (EDGE_COUNT (next->header->preds) == 1);

      /* The PHI nodes of the second body (single-argument now)
         need adjustments to use the right values: either directly
         the value of the corresponding PHI in the first copy or
         the one leaving the first body which unrolling did for us.

         See also unroll_jam_possible_p() for further possibilities.  */
      gphi_iterator psi_first, psi_second;
      e = single_pred_edge (next->header);
      for (psi_first = gsi_start_phis (loop->header),
           psi_second = gsi_start_phis (next->header);
           !gsi_end_p (psi_first);
           gsi_next (&psi_first), gsi_next (&psi_second))
        {
          gphi *phi_first = psi_first.phi ();
          gphi *phi_second = psi_second.phi ();
          tree firstop = gimple_phi_result (phi_first);
          /* The virtual operand is correct already as it's
             always live at exit, hence has a LCSSA node and outer
             loop unrolling updated SSA form.  */
          if (virtual_operand_p (firstop))
            continue;

          /* Due to unroll_jam_possible_p() we know that this is
             an induction.  The second body goes over the same
             iteration space.  */
          add_phi_arg (phi_second, firstop, e,
                       gimple_location (phi_first));
        }
      gcc_assert (gsi_end_p (psi_second));

      merge_loop_tree (loop, next);
      gcc_assert (!next->num_nodes);
      class loop *ln = next->next;
      delete_loop (next);
      next = ln;
    }
  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
}
/* Return true if any of the access functions for dataref A
   isn't invariant with respect to loop LOOP_NEST.  */

static bool
any_access_function_variant_p (const struct data_reference *a,
                               const class loop *loop_nest)
{
  vec<tree> fns = DR_ACCESS_FNS (a);
  tree t;
  unsigned i;
  FOR_EACH_VEC_ELT (fns, i, t)
    if (!evolution_function_is_invariant_p (t, loop_nest->num))
      return true;

  return false;
}
/* Returns true if the distance in DDR can be determined and adjusts
   the unroll factor in *UNROLL to make unrolling valid for that distance.
   Otherwise return false.  DDR is with respect to the outer loop of INNER.

   If this data dep can lead to a removed memory reference, increment
   *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
   for this to happen.  */

static bool
adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
                      unsigned *unroll, unsigned *profit_unroll,
                      unsigned *removed)
{
  bool ret = false;
  if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
    {
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
        return false;
      unsigned i;
      lambda_vector dist_v;
      FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
        {
          /* A distance (a,b) is at worst transformed into (a/N,b) by the
             unrolling (factor N), so the transformation is valid if
             a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
             factor needs to be limited so that the first condition holds.
             That may limit the factor down to zero in the worst case.  */
          int dist = dist_v[0];
          if (dist < 0)
            gcc_unreachable ();
          else if ((unsigned)dist >= *unroll)
            ;
          else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            {
              /* We have (a,0) with a < N, so this will be transformed into
                 (0,0) after unrolling by N.  This might potentially be a
                 problem, if it's not a read-read dependency.  */
              if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
                ;
              else
                {
                  /* So, at least one is a write, and we might reduce the
                     distance vector to (0,0).  This is still no problem
                     if both data-refs are affine with respect to the inner
                     loops.  But if one of them is invariant with respect
                     to an inner loop our reordering implicit in loop fusion
                     corrupts the program, as our data dependences don't
                     capture this.  E.g. for:
                       for (i) for (j) {
                         a[i][0] = a[i+1][0] + 2;      // (1)
                         b[i][j] = b[i+1][j] + 2;      // (2)
                       }
                     the distance vector for both statements is (-1,0),
                     but exchanging the order for (2) is okay, while
                     for (1) it is not.  To see this, write out the original
                     accesses (assume m is 2; the columns are access number,
                     i, j, read/write, element of a, element of b):
                       0  0 0  r  a[1][0]  b[1][0]
                       1  0 0  w  a[0][0]  b[0][0]
                       2  0 1  r  a[1][0]  b[1][1]
                       3  0 1  w  a[0][0]  b[0][1]
                       4  1 0  r  a[2][0]  b[2][0]
                       5  1 0  w  a[1][0]  b[1][0]
                     after unroll-by-2 and fusion the accesses are done in
                     this order (from column a): 0,1, 4,5, 2,3, i.e. this:
                       0  0 0  r  a[1][0]  b[1][0]
                       1  0 0  w  a[0][0]  b[0][0]
                       4  1 0  r  a[2][0]  b[2][0]
                       5  1 0  w  a[1][0]  b[1][0]
                       2  0 1  r  a[1][0]  b[1][1]
                       3  0 1  w  a[0][0]  b[0][1]
                     Note how access 2 accesses the same element as access 5
                     for array 'a' but not for array 'b'.  */
                  if (any_access_function_variant_p (DDR_A (ddr), inner)
                      && any_access_function_variant_p (DDR_B (ddr), inner))
                    ;
                  else
                    /* And if any dataref of this pair is invariant with
                       respect to the inner loop, we have no choice but
                       to reduce the unroll factor.  */
                    *unroll = dist;
                }
            }
          else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            ;
          else
            *unroll = dist;

          /* With a distance (a,0) it's always profitable to unroll-and-jam
             (by a+1), because one memory reference will go away.  With
             (a,b) and b != 0 that's less clear.  We will increase the
             number of streams without lowering the number of mem refs.
             So for now only handle the first situation.  */
          if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            {
              *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
              (*removed)++;
            }

          ret = true;
        }
    }
  return ret;
}
/* Main entry point for the unroll-and-jam transformation
   itself.  */

static unsigned int
tree_loop_unroll_and_jam (void)
{
  unsigned int todo = 0;

  gcc_assert (scev_initialized_p ());

  /* Go through all innermost loops.  */
  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
    {
      class loop *outer = loop_outer (loop);

      if (loop_depth (loop) < 2
          || optimize_loop_nest_for_size_p (outer))
        continue;

      if (!unroll_jam_possible_p (outer, loop))
        continue;

      vec<data_reference_p> datarefs = vNULL;
      vec<ddr_p> dependences = vNULL;
      unsigned unroll_factor, profit_unroll, removed;
      class tree_niter_desc desc;
      bool unroll = false;

      auto_vec<loop_p, 3> loop_nest;
      if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
                                              &datarefs, &dependences))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file, "Cannot analyze data dependencies\n");
          free_data_refs (datarefs);
          free_dependence_relations (dependences);
          continue;
        }
      if (!datarefs.length ())
        continue;

      if (dump_file && (dump_flags & TDF_DETAILS))
        dump_data_dependence_relations (dump_file, dependences);

      unroll_factor = (unsigned)-1;
      profit_unroll = 1;
      removed = 0;

      /* Check all dependencies.  */
      unsigned i;
      struct data_dependence_relation *ddr;
      FOR_EACH_VEC_ELT (dependences, i, ddr)
        {
          struct data_reference *dra, *drb;

          /* If the refs are independent there's nothing to do.  */
          if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
            continue;
          dra = DDR_A (ddr);
          drb = DDR_B (ddr);
          /* Nothing interesting for the self dependencies.  */
          if (dra == drb)
            continue;

          /* Now check the distance vector, for determining a sensible
             outer unroll factor, and for validity of merging the inner
             loop copies.  */
          if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
                                     &removed))
            {
              /* Couldn't get the distance vector.  For two reads that's
                 harmless (we assume we should unroll).  For at least
                 one write this means we can't check the dependence direction
                 and hence can't determine safety.  */
              if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
                {
                  unroll_factor = 0;
                  break;
                }
            }
        }

      /* We regard a user-specified minimum percentage of zero as a request
         to ignore all profitability concerns and apply the transformation
         always.  */
      if (!param_unroll_jam_min_percent)
        profit_unroll = MAX (2, profit_unroll);
      else if (removed * 100 / datarefs.length ()
               < (unsigned)param_unroll_jam_min_percent)
        profit_unroll = 1;
      if (unroll_factor > profit_unroll)
        unroll_factor = profit_unroll;
      if (unroll_factor > (unsigned)param_unroll_jam_max_unroll)
        unroll_factor = param_unroll_jam_max_unroll;
      unroll = (unroll_factor > 1
                && can_unroll_loop_p (outer, unroll_factor, &desc));

      if (unroll)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
                             find_loop_location (outer),
                             "applying unroll and jam with factor %d\n",
                             unroll_factor);
          initialize_original_copy_tables ();
          tree_unroll_loop (outer, unroll_factor, &desc);
          free_original_copy_tables ();
          fuse_loops (outer->inner);
          todo |= TODO_cleanup_cfg;

          auto_bitmap exit_bbs;
          bitmap_set_bit (exit_bbs, single_dom_exit (outer)->dest->index);
          todo |= do_rpo_vn (cfun, loop_preheader_edge (outer), exit_bbs);
        }

      loop_nest.release ();
      free_dependence_relations (dependences);
      free_data_refs (datarefs);
    }

  if (todo)
    {
      scev_reset ();
      free_dominance_info (CDI_DOMINATORS);
    }

  return todo;
}
/* Pass boilerplate */

namespace {

const pass_data pass_data_loop_jam =
{
  GIMPLE_PASS, /* type */
  "unrolljam", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_JAM, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_jam : public gimple_opt_pass
{
public:
  pass_loop_jam (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_jam, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_unroll_jam != 0; }
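  /* flag_unroll_jam is set by -floop-unroll-and-jam (enabled by default
     at -O3); the profitability and size limits used above come from
     --param unroll-jam-min-percent and --param unroll-jam-max-unroll.  */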
  virtual unsigned int execute (function *);
}; // class pass_loop_jam

unsigned int
pass_loop_jam::execute (function *fun)
{
  if (number_of_loops (fun) <= 1)
    return 0;

  return tree_loop_unroll_and_jam ();
}

} // anon namespace

gimple_opt_pass *
make_pass_loop_jam (gcc::context *ctxt)
{
  return new pass_loop_jam (ctxt);
}