2 Copyright (C) 2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 3, or (at your option) any
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
28 #include "tree-pass.h"
30 #include "gimple-pretty-print.h"
31 #include "fold-const.h"
33 #include "gimple-iterator.h"
34 #include "gimplify-me.h"
38 #include "tree-scalar-evolution.h"
39 #include "tree-ssa-loop-manip.h"
40 #include "tree-ssa-loop-niter.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-dce.h"
43 #include "tree-data-ref.h"
44 #include "tree-vectorizer.h"
46 /* This pass performs loop interchange: for example, the loop nest
48 for (int j = 0; j < N; j++)
49 for (int k = 0; k < N; k++)
50 for (int i = 0; i < N; i++)
51 c[i][j] = c[i][j] + a[i][k]*b[k][j];
55 for (int i = 0; i < N; i++)
56 for (int j = 0; j < N; j++)
57 for (int k = 0; k < N; k++)
58 c[i][j] = c[i][j] + a[i][k]*b[k][j];
60 This pass implements loop interchange in the following steps:
62 1) Find perfect loop nest for each innermost loop and compute data
63 dependence relations for it. For above example, loop nest is
64 <loop_j, loop_k, loop_i>.
65 2) From innermost to outermost loop, this pass tries to interchange
66 each loop pair. For above case, it firstly tries to interchange
67 <loop_k, loop_i> and loop nest becomes <loop_j, loop_i, loop_k>.
68 Then it tries to interchange <loop_j, loop_i> and loop nest becomes
69 <loop_i, loop_j, loop_k>. The overall effect is to move innermost
70 loop to the outermost position. For loop pair <loop_i, loop_j>
71 to be interchanged, we:
72 3) Check if data dependence relations are valid for loop interchange.
73 4) Check if both loops can be interchanged in terms of transformation.
74 5) Check if interchanging the two loops is profitable.
75 6) Interchange the two loops by mapping induction variables.
77 This pass also handles reductions in loop nest. So far we only support
78 simple reduction of inner loop and double reduction of the loop nest. */
80 /* Maximum number of stmts in each loop that should be interchanged. */
81 #define MAX_NUM_STMT (PARAM_VALUE (PARAM_LOOP_INTERCHANGE_MAX_NUM_STMTS))
82 /* Maximum number of data references in loop nest. */
83 #define MAX_DATAREFS (PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
85 /* Comparison ratio of access stride between inner/outer loops to be
86 interchanged. This is the minimum stride ratio for loop interchange
88 #define OUTER_STRIDE_RATIO (PARAM_VALUE (PARAM_LOOP_INTERCHANGE_STRIDE_RATIO))
89 /* The same as above, but we require higher ratio for interchanging the
90 innermost two loops. */
91 #define INNER_STRIDE_RATIO ((OUTER_STRIDE_RATIO) + 1)
93 /* Vector of strides that DR accesses in each level loop of a loop nest. */
94 #define DR_ACCESS_STRIDE(dr) ((vec<tree> *) dr->aux)
96 /* Structure recording loop induction variable. */
97 typedef struct induction
101 /* IV's initializing value, which is the init arg of the IV PHI node. */
103 /* IV's initializing expr, which is (the expanded result of) init_val. */
109 /* Enum type for loop reduction variable. */
117 /* Structure recording loop reduction variable. */
118 typedef struct reduction
120 /* Reduction itself. */
122 /* PHI node defining reduction variable. */
124 /* Init and next variables of the reduction. */
127 /* Lcssa PHI node if reduction is used outside of its definition loop. */
129 /* Stmts defining init and next. */
132 /* If init is loaded from memory, this is the loading memory reference. */
134 /* If reduction is finally stored to memory, this is the stored memory
137 enum reduction_type type
;
141 /* Dump reduction RE. */
144 dump_reduction (reduction_p re
)
146 if (re
->type
== SIMPLE_RTYPE
)
147 fprintf (dump_file
, " Simple reduction: ");
148 else if (re
->type
== DOUBLE_RTYPE
)
149 fprintf (dump_file
, " Double reduction: ");
151 fprintf (dump_file
, " Unknown reduction: ");
153 print_gimple_stmt (dump_file
, re
->phi
, 0);
156 /* Dump LOOP's induction IV. */
158 dump_induction (struct loop
*loop
, induction_p iv
)
160 fprintf (dump_file
, " Induction: ");
161 print_generic_expr (dump_file
, iv
->var
, TDF_SLIM
);
162 fprintf (dump_file
, " = {");
163 print_generic_expr (dump_file
, iv
->init_expr
, TDF_SLIM
);
164 fprintf (dump_file
, ", ");
165 print_generic_expr (dump_file
, iv
->step
, TDF_SLIM
);
166 fprintf (dump_file
, "}_%d\n", loop
->num
);
169 /* Loop candidate for interchange. */
173 loop_cand (struct loop
*, struct loop
*);
176 reduction_p
find_reduction_by_stmt (gimple
*);
177 void classify_simple_reduction (reduction_p
);
178 bool analyze_iloop_reduction_var (tree
);
179 bool analyze_oloop_reduction_var (loop_cand
*, tree
);
180 bool analyze_induction_var (tree
, tree
);
181 bool analyze_carried_vars (loop_cand
*);
182 bool analyze_lcssa_phis (void);
183 bool can_interchange_p (loop_cand
*);
184 bool supported_operations (basic_block
, loop_cand
*, int *);
185 void undo_simple_reduction (reduction_p
, bitmap
);
187 /* The loop itself. */
189 /* The outer loop for interchange. It equals to loop if this loop cand
190 itself represents the outer loop. */
191 struct loop
*m_outer
;
192 /* Vector of induction variables in loop. */
193 vec
<induction_p
> m_inductions
;
194 /* Vector of reduction variables in loop. */
195 vec
<reduction_p
> m_reductions
;
196 /* Lcssa PHI nodes of this loop. */
197 vec
<gphi
*> m_lcssa_nodes
;
198 /* Single exit edge of this loop. */
200 /* Basic blocks of this loop. */
206 loop_cand::loop_cand (struct loop
*loop
, struct loop
*outer
)
207 : m_loop (loop
), m_outer (outer
),
208 m_exit (single_exit (loop
)), m_bbs (get_loop_body (loop
))
210 m_inductions
.create (3);
211 m_reductions
.create (3);
212 m_lcssa_nodes
.create (3);
217 loop_cand::~loop_cand ()
220 for (unsigned i
= 0; m_inductions
.iterate (i
, &iv
); ++i
)
224 for (unsigned i
= 0; m_reductions
.iterate (i
, &re
); ++i
)
227 m_inductions
.release ();
228 m_reductions
.release ();
229 m_lcssa_nodes
.release ();
233 /* Return single use stmt of VAR in LOOP, otherwise return NULL. */
236 single_use_in_loop (tree var
, struct loop
*loop
)
238 gimple
*stmt
, *res
= NULL
;
240 imm_use_iterator iterator
;
242 FOR_EACH_IMM_USE_FAST (use_p
, iterator
, var
)
244 stmt
= USE_STMT (use_p
);
245 if (is_gimple_debug (stmt
))
248 if (!flow_bb_inside_loop_p (loop
, gimple_bb (stmt
)))
259 /* Return true if E is unsupported in loop interchange, i.e, E is a complex
260 edge or part of irreducible loop. */
263 unsupported_edge (edge e
)
265 return (e
->flags
& (EDGE_COMPLEX
| EDGE_IRREDUCIBLE_LOOP
));
268 /* Return the reduction if STMT is one of its lcssa PHI, producer or consumer
272 loop_cand::find_reduction_by_stmt (gimple
*stmt
)
274 gphi
*phi
= dyn_cast
<gphi
*> (stmt
);
277 for (unsigned i
= 0; m_reductions
.iterate (i
, &re
); ++i
)
278 if ((phi
!= NULL
&& phi
== re
->lcssa_phi
)
279 || (stmt
== re
->producer
|| stmt
== re
->consumer
))
285 /* Return true if all stmts in BB can be supported by loop interchange,
286 otherwise return false. ILOOP is not NULL if this loop_cand is the
287 outer loop in loop nest. Add the number of supported statements to
291 loop_cand::supported_operations (basic_block bb
, loop_cand
*iloop
,
294 int bb_num_stmts
= 0;
296 gimple_stmt_iterator gsi
;
298 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
300 gimple
*stmt
= gsi_stmt (gsi
);
301 if (is_gimple_debug (stmt
))
304 if (gimple_has_side_effects (stmt
))
308 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
310 /* In basic block of outer loop, the call should be cheap since
311 it will be moved to inner loop. */
313 && !gimple_inexpensive_call_p (call
))
318 if (!iloop
|| !gimple_vuse (stmt
))
321 /* Support stmt accessing memory in outer loop only if it is for inner
323 if (iloop
->find_reduction_by_stmt (stmt
))
327 /* Support loop invariant memory reference if it's only used once by
329 /* ??? How's this checking for invariantness? */
330 if (gimple_assign_single_p (stmt
)
331 && (lhs
= gimple_assign_lhs (stmt
)) != NULL_TREE
332 && TREE_CODE (lhs
) == SSA_NAME
333 && single_use_in_loop (lhs
, iloop
->m_loop
))
338 *num_stmts
+= bb_num_stmts
;
340 /* Allow PHI nodes in any basic block of inner loop, PHI nodes in outer
341 loop's header, or PHI nodes in dest bb of inner loop's exit edge. */
342 if (!iloop
|| bb
== m_loop
->header
343 || bb
== iloop
->m_exit
->dest
)
346 /* Don't allow any other PHI nodes. */
347 for (psi
= gsi_start_phis (bb
); !gsi_end_p (psi
); gsi_next (&psi
))
348 if (!virtual_operand_p (PHI_RESULT (psi
.phi ())))
354 /* Return true if current loop_cand be interchanged. ILOOP is not NULL if
355 current loop_cand is outer loop in loop nest. */
358 loop_cand::can_interchange_p (loop_cand
*iloop
)
360 /* For now we only support at most one reduction. */
361 unsigned allowed_reduction_num
= 1;
363 /* Only support reduction if the loop nest to be interchanged is the
364 innermost two loops. */
365 if ((iloop
== NULL
&& m_loop
->inner
!= NULL
)
366 || (iloop
!= NULL
&& iloop
->m_loop
->inner
!= NULL
))
367 allowed_reduction_num
= 0;
369 if (m_reductions
.length () > allowed_reduction_num
370 || (m_reductions
.length () == 1
371 && m_reductions
[0]->type
== UNKNOWN_RTYPE
))
374 /* Only support lcssa PHI node which is for reduction. */
375 if (m_lcssa_nodes
.length () > allowed_reduction_num
)
379 /* Check basic blocks other than loop header/exit. */
380 for (unsigned i
= 0; i
< m_loop
->num_nodes
; i
++)
382 basic_block bb
= m_bbs
[i
];
384 /* Skip basic blocks of inner loops. */
385 if (bb
->loop_father
!= m_loop
)
388 /* Check if basic block has any unsupported operation. */
389 if (!supported_operations (bb
, iloop
, &num_stmts
))
392 /* Check if loop has too many stmts. */
393 if (num_stmts
> MAX_NUM_STMT
)
400 /* Programmers and optimizers (like loop store motion) may optimize code:
402 for (int i = 0; i < N; i++)
403 for (int j = 0; j < N; j++)
404 a[i] += b[j][i] * c[j][i];
408 for (int i = 0; i < N; i++)
410 // producer. Note sum can be initialized to a constant.
412 for (int j = 0; j < N; j++)
414 sum += b[j][i] * c[j][i];
420 The result code can't be interchanged without undoing the optimization.
421 This function classifies this kind of reduction and records information so
422 that we can undo the store motion during interchange. */
425 loop_cand::classify_simple_reduction (reduction_p re
)
427 gimple
*producer
, *consumer
;
429 /* Check init variable of reduction and how it is initialized. */
430 if (TREE_CODE (re
->init
) == SSA_NAME
)
432 producer
= SSA_NAME_DEF_STMT (re
->init
);
433 re
->producer
= producer
;
434 basic_block bb
= gimple_bb (producer
);
435 if (!bb
|| bb
->loop_father
!= m_outer
)
438 if (!gimple_assign_load_p (producer
))
441 re
->init_ref
= gimple_assign_rhs1 (producer
);
443 else if (!CONSTANT_CLASS_P (re
->init
))
446 /* Check how reduction variable is used. */
447 consumer
= single_use_in_loop (PHI_RESULT (re
->lcssa_phi
), m_outer
);
449 || !gimple_store_p (consumer
))
452 re
->fini_ref
= gimple_get_lhs (consumer
);
453 re
->consumer
= consumer
;
455 /* Simple reduction with constant initializer. */
458 gcc_assert (CONSTANT_CLASS_P (re
->init
));
459 re
->init_ref
= unshare_expr (re
->fini_ref
);
462 /* Require memory references in producer and consumer are the same so
463 that we can undo reduction during interchange. */
464 if (re
->init_ref
&& !operand_equal_p (re
->init_ref
, re
->fini_ref
, 0))
467 re
->type
= SIMPLE_RTYPE
;
470 /* Analyze reduction variable VAR for inner loop of the loop nest to be
471 interchanged. Return true if analysis succeeds. */
474 loop_cand::analyze_iloop_reduction_var (tree var
)
476 gphi
*phi
= as_a
<gphi
*> (SSA_NAME_DEF_STMT (var
));
477 gphi
*lcssa_phi
= NULL
, *use_phi
;
478 tree init
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_preheader_edge (m_loop
));
479 tree next
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (m_loop
));
481 gimple
*stmt
, *next_def
, *single_use
= NULL
;
483 imm_use_iterator iterator
;
485 if (TREE_CODE (next
) != SSA_NAME
)
488 next_def
= SSA_NAME_DEF_STMT (next
);
489 basic_block bb
= gimple_bb (next_def
);
490 if (!bb
|| !flow_bb_inside_loop_p (m_loop
, bb
))
493 /* In restricted reduction, the var is (and must be) used in defining
494 the updated var. The process can be depicted as below:
496 var ;; = PHI<init, next>
500 +---------------------+
501 | reduction operators | <-- other operands
502 +---------------------+
508 In terms of loop interchange, we don't change how NEXT is computed based
509 on VAR and OTHER OPERANDS. In case of double reduction in loop nest
510 to be interchanged, we don't change it at all. In the case of simple
511 reduction in inner loop, we only change how VAR/NEXT is loaded or
512 stored. With these conditions, we can relax restrictions on reduction
513 in a way that reduction operation is seen as black box. In general,
514 we can ignore reassociation of reduction operator; we can handle fake
515 reductions in which VAR is not even used to compute NEXT. */
516 if (! single_imm_use (var
, &use_p
, &single_use
)
517 || ! flow_bb_inside_loop_p (m_loop
, gimple_bb (single_use
)))
520 /* Check the reduction operation. We require a left-associative operation.
521 For FP math we also need to be allowed to associate operations. */
522 if (gassign
*ass
= dyn_cast
<gassign
*> (single_use
))
524 enum tree_code code
= gimple_assign_rhs_code (ass
);
525 if (! (associative_tree_code (code
)
526 || (code
== MINUS_EXPR
527 && use_p
->use
== gimple_assign_rhs1_ptr (ass
)))
528 || (FLOAT_TYPE_P (TREE_TYPE (var
))
529 && ! flag_associative_math
))
535 /* Handle and verify a series of stmts feeding the reduction op. */
536 if (single_use
!= next_def
537 && !check_reduction_path (UNKNOWN_LOCATION
, m_loop
, phi
, next
,
538 gimple_assign_rhs_code (single_use
)))
541 /* Only support cases in which INIT is used in inner loop. */
542 if (TREE_CODE (init
) == SSA_NAME
)
543 FOR_EACH_IMM_USE_FAST (use_p
, iterator
, init
)
545 stmt
= USE_STMT (use_p
);
546 if (is_gimple_debug (stmt
))
549 if (!flow_bb_inside_loop_p (m_loop
, gimple_bb (stmt
)))
553 FOR_EACH_IMM_USE_FAST (use_p
, iterator
, next
)
555 stmt
= USE_STMT (use_p
);
556 if (is_gimple_debug (stmt
))
559 /* Or else it's used in PHI itself. */
560 use_phi
= dyn_cast
<gphi
*> (stmt
);
566 && gimple_bb (stmt
) == m_exit
->dest
567 && PHI_ARG_DEF_FROM_EDGE (use_phi
, m_exit
) == next
)
575 re
= XCNEW (struct reduction
);
580 re
->lcssa_phi
= lcssa_phi
;
582 classify_simple_reduction (re
);
584 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
587 m_reductions
.safe_push (re
);
591 /* Analyze reduction variable VAR for outer loop of the loop nest to be
592 interchanged. ILOOP is not NULL and points to inner loop. For the
593 moment, we only support double reduction for outer loop, like:
595 for (int i = 0; i < n; i++)
599 for (int j = 0; j < n; j++) // outer loop
600 for (int k = 0; k < n; k++) // inner loop
601 sum += a[i][k]*b[k][j];
606 Note the innermost two loops are the loop nest to be interchanged.
607 Return true if analysis succeeds. */
610 loop_cand::analyze_oloop_reduction_var (loop_cand
*iloop
, tree var
)
612 gphi
*phi
= as_a
<gphi
*> (SSA_NAME_DEF_STMT (var
));
613 gphi
*lcssa_phi
= NULL
, *use_phi
;
614 tree init
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_preheader_edge (m_loop
));
615 tree next
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (m_loop
));
617 gimple
*stmt
, *next_def
;
619 imm_use_iterator iterator
;
621 if (TREE_CODE (next
) != SSA_NAME
)
624 next_def
= SSA_NAME_DEF_STMT (next
);
625 basic_block bb
= gimple_bb (next_def
);
626 if (!bb
|| !flow_bb_inside_loop_p (m_loop
, bb
))
629 /* Find inner loop's simple reduction that uses var as initializer. */
630 reduction_p inner_re
= NULL
;
631 for (unsigned i
= 0; iloop
->m_reductions
.iterate (i
, &inner_re
); ++i
)
632 if (inner_re
->init
== var
|| operand_equal_p (inner_re
->init
, var
, 0))
636 || inner_re
->type
!= UNKNOWN_RTYPE
637 || inner_re
->producer
!= phi
)
640 /* In case of double reduction, outer loop's reduction should be updated
641 by inner loop's simple reduction. */
642 if (next_def
!= inner_re
->lcssa_phi
)
645 /* Outer loop's reduction should only be used to initialize inner loop's
647 if (! single_imm_use (var
, &use_p
, &stmt
)
648 || stmt
!= inner_re
->phi
)
651 /* Check this reduction is correctly used outside of loop via lcssa phi. */
652 FOR_EACH_IMM_USE_FAST (use_p
, iterator
, next
)
654 stmt
= USE_STMT (use_p
);
655 if (is_gimple_debug (stmt
))
658 /* Or else it's used in PHI itself. */
659 use_phi
= dyn_cast
<gphi
*> (stmt
);
663 if (lcssa_phi
== NULL
665 && gimple_bb (stmt
) == m_exit
->dest
666 && PHI_ARG_DEF_FROM_EDGE (use_phi
, m_exit
) == next
)
674 re
= XCNEW (struct reduction
);
679 re
->lcssa_phi
= lcssa_phi
;
680 re
->type
= DOUBLE_RTYPE
;
681 inner_re
->type
= DOUBLE_RTYPE
;
683 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
686 m_reductions
.safe_push (re
);
690 /* Return true if VAR is induction variable of current loop whose scev is
691 specified by CHREC. */
694 loop_cand::analyze_induction_var (tree var
, tree chrec
)
696 gphi
*phi
= as_a
<gphi
*> (SSA_NAME_DEF_STMT (var
));
697 tree init
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_preheader_edge (m_loop
));
699 /* Var is loop invariant, though it's unlikely to happen. */
700 if (tree_does_not_contain_chrecs (chrec
))
702 struct induction
*iv
= XCNEW (struct induction
);
705 iv
->init_expr
= chrec
;
706 iv
->step
= build_int_cst (TREE_TYPE (chrec
), 0);
707 m_inductions
.safe_push (iv
);
711 if (TREE_CODE (chrec
) != POLYNOMIAL_CHREC
712 || CHREC_VARIABLE (chrec
) != (unsigned) m_loop
->num
713 || tree_contains_chrecs (CHREC_LEFT (chrec
), NULL
)
714 || tree_contains_chrecs (CHREC_RIGHT (chrec
), NULL
))
717 struct induction
*iv
= XCNEW (struct induction
);
720 iv
->init_expr
= CHREC_LEFT (chrec
);
721 iv
->step
= CHREC_RIGHT (chrec
);
723 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
724 dump_induction (m_loop
, iv
);
726 m_inductions
.safe_push (iv
);
730 /* Return true if all loop carried variables defined in loop header can
731 be successfully analyzed. */
734 loop_cand::analyze_carried_vars (loop_cand
*iloop
)
736 edge e
= loop_preheader_edge (m_outer
);
739 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
740 fprintf (dump_file
, "\nLoop(%d) carried vars:\n", m_loop
->num
);
742 for (gsi
= gsi_start_phis (m_loop
->header
); !gsi_end_p (gsi
); gsi_next (&gsi
))
744 gphi
*phi
= gsi
.phi ();
746 tree var
= PHI_RESULT (phi
);
747 if (virtual_operand_p (var
))
750 tree chrec
= analyze_scalar_evolution (m_loop
, var
);
751 chrec
= instantiate_scev (e
, m_loop
, chrec
);
753 /* Analyze var as reduction variable. */
754 if (chrec_contains_undetermined (chrec
)
755 || chrec_contains_symbols_defined_in_loop (chrec
, m_outer
->num
))
757 if (iloop
&& !analyze_oloop_reduction_var (iloop
, var
))
759 if (!iloop
&& !analyze_iloop_reduction_var (var
))
762 /* Analyze var as induction variable. */
763 else if (!analyze_induction_var (var
, chrec
))
770 /* Return TRUE if loop closed PHI nodes can be analyzed successfully. */
773 loop_cand::analyze_lcssa_phis (void)
776 for (gsi
= gsi_start_phis (m_exit
->dest
); !gsi_end_p (gsi
); gsi_next (&gsi
))
778 gphi
*phi
= gsi
.phi ();
780 if (virtual_operand_p (PHI_RESULT (phi
)))
783 /* TODO: We only support lcssa phi for reduction for now. */
784 if (!find_reduction_by_stmt (phi
))
791 /* CONSUMER is a stmt in BB storing reduction result into memory object.
792 When the reduction is initialized from a constant value, we need to add
793 a stmt loading from the memory object to target basic block in inner
794 loop during undoing the reduction. Problem is that memory reference
795 may use ssa variables not dominating the target basic block. This
796 function finds all stmts on which CONSUMER depends in basic block BB,
797 records and returns them via STMTS. */
800 find_deps_in_bb_for_stmt (gimple_seq
*stmts
, basic_block bb
, gimple
*consumer
)
802 auto_vec
<gimple
*, 4> worklist
;
805 gimple
*stmt
, *def_stmt
;
806 gimple_stmt_iterator gsi
;
808 /* First clear flag for stmts in bb. */
809 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
810 gimple_set_plf (gsi_stmt (gsi
), GF_PLF_1
, false);
812 /* DFS search all depended stmts in bb and mark flag for these stmts. */
813 worklist
.safe_push (consumer
);
814 while (!worklist
.is_empty ())
816 stmt
= worklist
.pop ();
817 FOR_EACH_SSA_USE_OPERAND (use_p
, stmt
, iter
, SSA_OP_USE
)
819 def_stmt
= SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p
));
821 if (is_a
<gphi
*> (def_stmt
)
822 || gimple_bb (def_stmt
) != bb
823 || gimple_plf (def_stmt
, GF_PLF_1
))
826 worklist
.safe_push (def_stmt
);
828 gimple_set_plf (stmt
, GF_PLF_1
, true);
830 for (gsi
= gsi_start_bb_nondebug (bb
);
831 !gsi_end_p (gsi
) && (stmt
= gsi_stmt (gsi
)) != consumer
;)
833 /* Move dep stmts to sequence STMTS. */
834 if (gimple_plf (stmt
, GF_PLF_1
))
836 gsi_remove (&gsi
, false);
837 gimple_seq_add_stmt_without_update (stmts
, stmt
);
840 gsi_next_nondebug (&gsi
);
844 /* User can write, optimizers can generate simple reduction RE for inner
845 loop. In order to make interchange valid, we have to undo reduction by
846 moving producer and consumer stmts into the inner loop. For example,
849 init = MEM_REF[idx]; //producer
851 var = phi<init, next>
853 reduc_sum = phi<next>
854 MEM_REF[idx] = reduc_sum //consumer
859 new_var = MEM_REF[idx]; //producer after moving
860 next = new_var op ...
861 MEM_REF[idx] = next; //consumer after moving
863 Note if the reduction variable is initialized to constant, like:
867 we compute new_var as below:
871 new_var = !first_iteration ? tmp : 0.0;
873 so that the initial const is used in the first iteration of loop. Also
874 record ssa variables for dead code elimination in DCE_SEEDS. */
877 loop_cand::undo_simple_reduction (reduction_p re
, bitmap dce_seeds
)
880 gimple_stmt_iterator from
, to
= gsi_after_labels (m_loop
->header
);
881 gimple_seq stmts
= NULL
;
884 /* Prepare the initialization stmts and insert it to inner loop. */
885 if (re
->producer
!= NULL
)
887 gimple_set_vuse (re
->producer
, NULL_TREE
);
888 from
= gsi_for_stmt (re
->producer
);
889 gsi_remove (&from
, false);
890 gimple_seq_add_stmt_without_update (&stmts
, re
->producer
);
895 /* Find all stmts on which expression "MEM_REF[idx]" depends. */
896 find_deps_in_bb_for_stmt (&stmts
, gimple_bb (re
->consumer
), re
->consumer
);
897 /* Because we generate new stmt loading from the MEM_REF to TMP. */
898 tree cond
, tmp
= copy_ssa_name (re
->var
);
899 stmt
= gimple_build_assign (tmp
, re
->init_ref
);
900 gimple_seq_add_stmt_without_update (&stmts
, stmt
);
902 /* Init new_var to MEM_REF or CONST depending on if it is the first
904 induction_p iv
= m_inductions
[0];
905 cond
= fold_build2 (NE_EXPR
, boolean_type_node
, iv
->var
, iv
->init_val
);
906 new_var
= copy_ssa_name (re
->var
);
907 stmt
= gimple_build_assign (new_var
, COND_EXPR
, cond
, tmp
, re
->init
);
908 gimple_seq_add_stmt_without_update (&stmts
, stmt
);
910 gsi_insert_seq_before (&to
, stmts
, GSI_SAME_STMT
);
912 /* Replace all uses of reduction var with new variable. */
914 imm_use_iterator iterator
;
915 FOR_EACH_IMM_USE_STMT (stmt
, iterator
, re
->var
)
917 FOR_EACH_IMM_USE_ON_STMT (use_p
, iterator
)
918 SET_USE (use_p
, new_var
);
923 /* Move consumer stmt into inner loop, just after reduction next's def. */
924 unlink_stmt_vdef (re
->consumer
);
925 release_ssa_name (gimple_vdef (re
->consumer
));
926 gimple_set_vdef (re
->consumer
, NULL_TREE
);
927 gimple_set_vuse (re
->consumer
, NULL_TREE
);
928 gimple_assign_set_rhs1 (re
->consumer
, re
->next
);
929 from
= gsi_for_stmt (re
->consumer
);
930 to
= gsi_for_stmt (SSA_NAME_DEF_STMT (re
->next
));
931 gsi_move_after (&from
, &to
);
933 /* Mark the reduction variables for DCE. */
934 bitmap_set_bit (dce_seeds
, SSA_NAME_VERSION (re
->var
));
935 bitmap_set_bit (dce_seeds
, SSA_NAME_VERSION (PHI_RESULT (re
->lcssa_phi
)));
938 /* Free DATAREFS and its auxiliary memory. */
941 free_data_refs_with_aux (vec
<data_reference_p
> datarefs
)
944 for (unsigned i
= 0; datarefs
.iterate (i
, &dr
); ++i
)
947 DR_ACCESS_STRIDE (dr
)->release ();
951 free_data_refs (datarefs
);
954 /* Class for loop interchange transformation. */
956 class tree_loop_interchange
959 tree_loop_interchange (vec
<struct loop
*> loop_nest
)
960 : m_loop_nest (loop_nest
), m_niters_iv_var (NULL_TREE
),
961 m_dce_seeds (BITMAP_ALLOC (NULL
)) { }
962 ~tree_loop_interchange () { BITMAP_FREE (m_dce_seeds
); }
963 bool interchange (vec
<data_reference_p
>, vec
<ddr_p
>);
966 void update_data_info (unsigned, unsigned, vec
<data_reference_p
>, vec
<ddr_p
>);
967 bool valid_data_dependences (unsigned, unsigned, vec
<ddr_p
>);
968 void interchange_loops (loop_cand
&, loop_cand
&);
969 void map_inductions_to_loop (loop_cand
&, loop_cand
&);
970 void move_code_to_inner_loop (struct loop
*, struct loop
*, basic_block
*);
972 /* The whole loop nest in which interchange is ongoing. */
973 vec
<struct loop
*> m_loop_nest
;
974 /* We create new IV which is only used in loop's exit condition check.
975 In case of 3-level loop nest interchange, when we interchange the
976 innermost two loops, new IV created in the middle level loop does
977 not need to be preserved in interchanging the outermost two loops
978 later. We record the IV so that it can be skipped. */
979 tree m_niters_iv_var
;
980 /* Bitmap of seed variables for dead code elimination after interchange. */
984 /* Update data refs' access stride and dependence information after loop
985 interchange. I_IDX/O_IDX gives indices of interchanged loops in loop
986 nest. DATAREFS are data references. DDRS are data dependences. */
989 tree_loop_interchange::update_data_info (unsigned i_idx
, unsigned o_idx
,
990 vec
<data_reference_p
> datarefs
,
993 struct data_reference
*dr
;
994 struct data_dependence_relation
*ddr
;
996 /* Update strides of data references. */
997 for (unsigned i
= 0; datarefs
.iterate (i
, &dr
); ++i
)
999 vec
<tree
> *stride
= DR_ACCESS_STRIDE (dr
);
1000 gcc_assert (stride
->length () > i_idx
);
1001 std::swap ((*stride
)[i_idx
], (*stride
)[o_idx
]);
1003 /* Update data dependences. */
1004 for (unsigned i
= 0; ddrs
.iterate (i
, &ddr
); ++i
)
1005 if (DDR_ARE_DEPENDENT (ddr
) != chrec_known
)
1007 for (unsigned j
= 0; j
< DDR_NUM_DIST_VECTS (ddr
); ++j
)
1009 lambda_vector dist_vect
= DDR_DIST_VECT (ddr
, j
);
1010 std::swap (dist_vect
[i_idx
], dist_vect
[o_idx
]);
1015 /* Check data dependence relations, return TRUE if it's valid to interchange
1016 two loops specified by I_IDX/O_IDX. Theoretically, interchanging the two
1017 loops is valid only if dist vector, after interchanging, doesn't have '>'
1018 as the leftmost non-'=' direction. Practically, this function has been
1019 conservative here by not checking some valid cases. */
1022 tree_loop_interchange::valid_data_dependences (unsigned i_idx
, unsigned o_idx
,
1025 struct data_dependence_relation
*ddr
;
1027 for (unsigned i
= 0; ddrs
.iterate (i
, &ddr
); ++i
)
1029 /* Skip no-dependence case. */
1030 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1033 for (unsigned j
= 0; j
< DDR_NUM_DIST_VECTS (ddr
); ++j
)
1035 lambda_vector dist_vect
= DDR_DIST_VECT (ddr
, j
);
1036 unsigned level
= dependence_level (dist_vect
, m_loop_nest
.length ());
1038 /* If there is no carried dependence. */
1044 /* If dependence is not carried by any loop in between the two
1045 loops [oloop, iloop] to interchange. */
1046 if (level
< o_idx
|| level
> i_idx
)
1049 /* Be conservative, skip case if either direction at i_idx/o_idx
1050 levels is not '=' or '<'. */
1051 if (dist_vect
[i_idx
] < 0 || dist_vect
[o_idx
] < 0)
1059 /* Interchange two loops specified by ILOOP and OLOOP. */
1062 tree_loop_interchange::interchange_loops (loop_cand
&iloop
, loop_cand
&oloop
)
1065 gimple_stmt_iterator gsi
;
1066 tree i_niters
, o_niters
, var_after
;
1068 /* Undo inner loop's simple reduction. */
1069 for (unsigned i
= 0; iloop
.m_reductions
.iterate (i
, &re
); ++i
)
1070 if (re
->type
!= DOUBLE_RTYPE
)
1073 reset_debug_uses (re
->producer
);
1075 iloop
.undo_simple_reduction (re
, m_dce_seeds
);
1078 /* Only need to reset debug uses for double reduction. */
1079 for (unsigned i
= 0; oloop
.m_reductions
.iterate (i
, &re
); ++i
)
1081 gcc_assert (re
->type
== DOUBLE_RTYPE
);
1082 reset_debug_uses (SSA_NAME_DEF_STMT (re
->var
));
1083 reset_debug_uses (SSA_NAME_DEF_STMT (re
->next
));
1086 /* Prepare niters for both loops. */
1087 struct loop
*loop_nest
= m_loop_nest
[0];
1088 edge instantiate_below
= loop_preheader_edge (loop_nest
);
1089 gsi
= gsi_last_bb (loop_preheader_edge (loop_nest
)->src
);
1090 i_niters
= number_of_latch_executions (iloop
.m_loop
);
1091 i_niters
= analyze_scalar_evolution (loop_outer (iloop
.m_loop
), i_niters
);
1092 i_niters
= instantiate_scev (instantiate_below
, loop_outer (iloop
.m_loop
),
1094 i_niters
= force_gimple_operand_gsi (&gsi
, unshare_expr (i_niters
), true,
1095 NULL_TREE
, false, GSI_CONTINUE_LINKING
);
1096 o_niters
= number_of_latch_executions (oloop
.m_loop
);
1097 if (oloop
.m_loop
!= loop_nest
)
1099 o_niters
= analyze_scalar_evolution (loop_outer (oloop
.m_loop
), o_niters
);
1100 o_niters
= instantiate_scev (instantiate_below
, loop_outer (oloop
.m_loop
),
1103 o_niters
= force_gimple_operand_gsi (&gsi
, unshare_expr (o_niters
), true,
1104 NULL_TREE
, false, GSI_CONTINUE_LINKING
);
1106 /* Move src's code to tgt loop. This is necessary when src is the outer
1107 loop and tgt is the inner loop. */
1108 move_code_to_inner_loop (oloop
.m_loop
, iloop
.m_loop
, oloop
.m_bbs
);
1110 /* Map outer loop's IV to inner loop, and vice versa. */
1111 map_inductions_to_loop (oloop
, iloop
);
1112 map_inductions_to_loop (iloop
, oloop
);
1114 /* Create canonical IV for both loops. Note canonical IV for outer/inner
1115 loop is actually from inner/outer loop. Also we record the new IV
1116 created for the outer loop so that it can be skipped in later loop
1118 create_canonical_iv (oloop
.m_loop
, oloop
.m_exit
,
1119 i_niters
, &m_niters_iv_var
, &var_after
);
1120 bitmap_set_bit (m_dce_seeds
, SSA_NAME_VERSION (var_after
));
1121 create_canonical_iv (iloop
.m_loop
, iloop
.m_exit
,
1122 o_niters
, NULL
, &var_after
);
1123 bitmap_set_bit (m_dce_seeds
, SSA_NAME_VERSION (var_after
));
1125 /* Scrap niters estimation of interchanged loops. */
1126 iloop
.m_loop
->any_upper_bound
= false;
1127 iloop
.m_loop
->any_likely_upper_bound
= false;
1128 free_numbers_of_iterations_estimates (iloop
.m_loop
);
1129 oloop
.m_loop
->any_upper_bound
= false;
1130 oloop
.m_loop
->any_likely_upper_bound
= false;
1131 free_numbers_of_iterations_estimates (oloop
.m_loop
);
1133 /* ??? The association between the loop data structure and the
1134 CFG changed, so what was loop N at the source level is now
1135 loop M. We should think of retaining the association or breaking
1136 it fully by creating a new loop instead of re-using the "wrong" one. */
1139 /* Map induction variables of SRC loop to TGT loop. The function firstly
1140 creates the same IV of SRC loop in TGT loop, then deletes the original
1141 IV and re-initialize it using the newly created IV. For example, loop
1144 for (i = 0; i < N; i++)
1145 for (j = 0; j < M; j++)
1151 will be transformed into:
1153 for (jj = 0; jj < M; jj++)
1154 for (ii = 0; ii < N; ii++)
1160 after loop interchange. */
1163 tree_loop_interchange::map_inductions_to_loop (loop_cand
&src
, loop_cand
&tgt
)
/* New IVs are inserted at the end of TGT's exit source block, i.e. right
   before its exit condition.  */
1166 edge e
= tgt
.m_exit
;
1167 gimple_stmt_iterator incr_pos
= gsi_last_bb (e
->src
), gsi
;
1169 /* Map source loop's IV to target loop. */
1170 for (unsigned i
= 0; src
.m_inductions
.iterate (i
, &iv
); ++i
)
1172 gimple
*use_stmt
, *stmt
= SSA_NAME_DEF_STMT (iv
->var
);
/* Induction variables are always defined by a PHI node.  */
1173 gcc_assert (is_a
<gphi
*> (stmt
));
1175 use_operand_p use_p
;
1176 /* Only map original IV to target loop. */
/* Skip the canonical IV created by ourselves for the outer loop; it is
   recorded in m_niters_iv_var by interchange_loops.  */
1177 if (m_niters_iv_var
!= iv
->var
)
1179 /* Map the IV by creating the same one in target loop. */
1180 tree var_before
, var_after
;
1181 tree base
= unshare_expr (iv
->init_expr
);
1182 tree step
= unshare_expr (iv
->step
);
1183 create_iv (base
, step
, SSA_NAME_VAR (iv
->var
),
1184 tgt
.m_loop
, &incr_pos
, false, &var_before
, &var_after
);
/* Seed DCE with the new IV names so they get cleaned up if unused.  */
1185 bitmap_set_bit (m_dce_seeds
, SSA_NAME_VERSION (var_before
));
1186 bitmap_set_bit (m_dce_seeds
, SSA_NAME_VERSION (var_after
));
1188 /* Replace uses of the original IV var with newly created IV var. */
1189 imm_use_iterator imm_iter
;
1190 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, iv
->var
)
1192 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
1193 SET_USE (use_p
, var_before
);
1195 update_stmt (use_stmt
);
1199 /* Mark all uses for DCE. */
/* Operands of the old PHI become dead once the PHI is removed below;
   record them so simple_dce_from_worklist can reclaim them.  */
1200 ssa_op_iter op_iter
;
1201 FOR_EACH_PHI_OR_STMT_USE (use_p
, stmt
, op_iter
, SSA_OP_USE
)
1203 tree use
= USE_FROM_PTR (use_p
);
1204 if (TREE_CODE (use
) == SSA_NAME
1205 && ! SSA_NAME_IS_DEFAULT_DEF (use
))
1206 bitmap_set_bit (m_dce_seeds
, SSA_NAME_VERSION (use
));
1209 /* Delete definition of the original IV in the source loop. */
1210 gsi
= gsi_for_stmt (stmt
);
1211 remove_phi_node (&gsi
, true);
1215 /* Move stmts of outer loop to inner loop. */
1218 tree_loop_interchange::move_code_to_inner_loop (struct loop
*outer
,
1220 basic_block
*outer_bbs
)
1222 basic_block oloop_exit_bb
= single_exit (outer
)->src
;
1223 gimple_stmt_iterator gsi
, to
;
/* Walk all blocks of OUTER via the precomputed OUTER_BBS array.  */
1225 for (unsigned i
= 0; i
< outer
->num_nodes
; i
++)
1227 basic_block bb
= outer_bbs
[i
];
1229 /* Skip basic blocks of inner loop. */
1230 if (flow_bb_inside_loop_p (inner
, bb
))
1233 /* Move code from header/latch to header/latch. */
1234 if (bb
== outer
->header
)
1235 to
= gsi_after_labels (inner
->header
);
1236 else if (bb
== outer
->latch
)
1237 to
= gsi_after_labels (inner
->latch
);
1239 /* Otherwise, simply move to exit->src. */
1240 to
= gsi_last_bb (single_exit (inner
)->src
);
1242 for (gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);)
1244 gimple
*stmt
= gsi_stmt (gsi
);
/* Keep the last stmt of the outer exit block (presumably its exit
   test) in place -- TODO(review): confirm against the full source.  */
1246 if (oloop_exit_bb
== bb
1247 && stmt
== gsi_stmt (gsi_last_bb (oloop_exit_bb
)))
/* Discard virtual operands before moving; virtual SSA form is rebuilt
   afterwards (see TODO_update_ssa_only_virtuals in execute).  */
1253 if (gimple_vuse (stmt
))
1254 gimple_set_vuse (stmt
, NULL_TREE
);
1255 if (gimple_vdef (stmt
))
1257 unlink_stmt_vdef (stmt
);
1258 release_ssa_name (gimple_vdef (stmt
));
1259 gimple_set_vdef (stmt
, NULL_TREE
);
1262 reset_debug_uses (stmt
);
1263 gsi_move_before (&gsi
, &to
);
1268 /* Given data reference DR in LOOP_NEST, the function computes DR's access
1269 stride at each level of loop from innermost LOOP to outer. On success,
1270 it saves access stride at each level loop in a vector which is pointed
1271 by DR->aux. For example:
1273 int arr[100][100][100];
1274 for (i = 0; i < 100; i++) ;(DR->aux)strides[0] = 40000
1275 for (j = 100; j > 0; j--) ;(DR->aux)strides[1] = 400
1276 for (k = 0; k < 100; k++) ;(DR->aux)strides[2] = 4
1277 arr[i][j - 1][k] = 0; */
1280 compute_access_stride (struct loop
*loop_nest
, struct loop
*loop
,
1281 data_reference_p dr
)
1283 vec
<tree
> *strides
= new vec
<tree
> ();
1284 basic_block bb
= gimple_bb (DR_STMT (dr
));
/* DR may live in an outer loop of the nest; record zero strides for each
   inner level it does not belong to and walk out to its own loop.  */
1286 while (!flow_bb_inside_loop_p (loop
, bb
))
1288 strides
->safe_push (build_int_cst (sizetype
, 0));
1289 loop
= loop_outer (loop
);
1291 gcc_assert (loop
== bb
->loop_father
);
1293 tree ref
= DR_REF (dr
);
1294 if (TREE_CODE (ref
) == COMPONENT_REF
1295 && DECL_BIT_FIELD (TREE_OPERAND (ref
, 1)))
1297 /* We can't take address of bitfields. If the bitfield is at constant
1298 offset from the start of the struct, just use address of the
1299 struct, for analysis of the strides that shouldn't matter. */
1300 if (!TREE_OPERAND (ref
, 2)
1301 || TREE_CODE (TREE_OPERAND (ref
, 2)) == INTEGER_CST
)
1302 ref
= TREE_OPERAND (ref
, 0);
1303 /* Otherwise, if we have a bit field representative, use that. */
1304 else if (DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref
, 1))
1307 tree repr
= DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref
, 1));
1308 ref
= build3 (COMPONENT_REF
, TREE_TYPE (repr
), TREE_OPERAND (ref
, 0),
1309 repr
, TREE_OPERAND (ref
, 2));
1311 /* Otherwise punt. */
/* Analyze the evolution of the reference's address in LOOP, instantiated
   at the preheader of the whole nest.  */
1318 tree scev_base
= build_fold_addr_expr (ref
);
1319 tree scev
= analyze_scalar_evolution (loop
, scev_base
);
1320 scev
= instantiate_scev (loop_preheader_edge (loop_nest
), loop
, scev
);
1321 if (! chrec_contains_undetermined (scev
))
/* Peel the chrec from the innermost level outwards, recording each
   level's step; levels the chrec does not vary in get stride zero.  */
1324 struct loop
*expected
= loop
;
1325 while (TREE_CODE (sl
) == POLYNOMIAL_CHREC
)
1327 struct loop
*sl_loop
= get_chrec_loop (sl
);
1328 while (sl_loop
!= expected
)
1330 strides
->safe_push (size_int (0));
1331 expected
= loop_outer (expected
);
1333 strides
->safe_push (CHREC_RIGHT (sl
));
1334 sl
= CHREC_LEFT (sl
);
1335 expected
= loop_outer (expected
);
/* If the remaining base is chrec-free, pad zero strides up to the
   outermost level of the nest.  */
1337 if (! tree_contains_chrecs (sl
, NULL
))
1338 while (expected
!= loop_outer (loop_nest
))
1340 strides
->safe_push (size_int (0));
1341 expected
= loop_outer (expected
);
1348 /* Given loop nest LOOP_NEST with innermost LOOP, the function computes
1349 access strides with respect to each level loop for all data refs in
1350 DATAREFS from inner loop to outer loop. On success, it returns the
1351 outermost loop that access strides can be computed successfully for
1352 all data references. If access strides cannot be computed at least
1353 for two levels of loop for any data reference, it returns NULL. */
1355 static struct loop
*
1356 compute_access_strides (struct loop
*loop_nest
, struct loop
*loop
,
1357 vec
<data_reference_p
> datarefs
)
1359 unsigned i
, j
, num_loops
= (unsigned) -1;
1360 data_reference_p dr
;
/* First pass: compute strides per DR and track the smallest number of
   loop levels covered by any DR.  */
1363 for (i
= 0; datarefs
.iterate (i
, &dr
); ++i
)
1365 compute_access_stride (loop_nest
, loop
, dr
);
1366 stride
= DR_ACCESS_STRIDE (dr
);
1367 if (stride
->length () < num_loops
)
1369 num_loops
= stride
->length ();
/* Second pass: truncate all stride vectors to the common length and
   reverse them in place, so strides end up ordered from outer to inner
   loop (compute_access_stride pushed them inner-first).  */
1375 for (i
= 0; datarefs
.iterate (i
, &dr
); ++i
)
1377 stride
= DR_ACCESS_STRIDE (dr
);
1378 if (stride
->length () > num_loops
)
1379 stride
->truncate (num_loops
);
1381 for (j
= 0; j
< (num_loops
>> 1); ++j
)
1382 std::swap ((*stride
)[j
], (*stride
)[num_loops
- j
- 1]);
/* Shrink the nest to the NUM_LOOPS levels all DRs have strides for.  */
1385 loop
= superloop_at_depth (loop
, loop_depth (loop
) + 1 - num_loops
);
1386 gcc_assert (loop_nest
== loop
|| flow_loop_nested_p (loop_nest
, loop
));
1390 /* Prune access strides for data references in DATAREFS by removing strides
1391 of loops that isn't in current LOOP_NEST. */
1394 prune_access_strides_not_in_loop (struct loop
*loop_nest
,
1395 struct loop
*innermost
,
1396 vec
<data_reference_p
> datarefs
)
1398 data_reference_p dr
;
1399 unsigned num_loops
= loop_depth (innermost
) - loop_depth (loop_nest
) + 1;
1400 gcc_assert (num_loops
> 1);
1402 /* Block remove strides of loops that is not in current loop nest. */
1403 for (unsigned i
= 0; datarefs
.iterate (i
, &dr
); ++i
)
1405 vec
<tree
> *stride
= DR_ACCESS_STRIDE (dr
);
1406 if (stride
->length () > num_loops
)
1407 stride
->block_remove (0, stride
->length () - num_loops
);
1411 /* Dump access strides for all DATAREFS. */
1414 dump_access_strides (vec
<data_reference_p
> datarefs
)
1416 data_reference_p dr
;
1417 fprintf (dump_file
, "Access Strides for DRs:\n");
1418 for (unsigned i
= 0; datarefs
.iterate (i
, &dr
); ++i
)
1420 fprintf (dump_file
, " ");
1421 print_generic_expr (dump_file
, DR_REF (dr
), TDF_SLIM
);
1422 fprintf (dump_file
, ":\t\t<");
1424 vec
<tree
> *stride
= DR_ACCESS_STRIDE (dr
);
1425 unsigned num_loops
= stride
->length ();
1426 for (unsigned j
= 0; j
< num_loops
; ++j
)
1428 print_generic_expr (dump_file
, (*stride
)[j
], TDF_SLIM
);
1429 fprintf (dump_file
, "%s", (j
< num_loops
- 1) ? ",\t" : ">\n");
1434 /* Return true if it's profitable to interchange two loops whose index
1435 in whole loop nest vector are I_IDX/O_IDX respectively. The function
1436 computes and compares three types information from all DATAREFS:
1437 1) Access stride for loop I_IDX and O_IDX.
1438 2) Number of invariant memory references with respect to I_IDX before
1439 and after loop interchange.
1440 3) Flags indicating if all memory references access sequential memory
1441 in ILOOP, before and after loop interchange.
1442 If INNERMOST_LOOPS_P is true, the two loops for interchanging are the two
1443 innermost loops in loop nest. This function also dumps information if
1444 DUMP_INFO_P is true. */
1447 should_interchange_loops (unsigned i_idx
, unsigned o_idx
,
1448 vec
<data_reference_p
> datarefs
,
1449 bool innermost_loops_p
, bool dump_info_p
= true)
1451 unsigned HOST_WIDE_INT ratio
;
1452 unsigned i
, j
, num_old_inv_drs
= 0, num_new_inv_drs
= 0;
1453 struct data_reference
*dr
;
1454 bool all_seq_dr_before_p
= true, all_seq_dr_after_p
= true;
1455 widest_int iloop_strides
= 0, oloop_strides
= 0;
1456 unsigned num_unresolved_drs
= 0;
1457 unsigned num_resolved_ok_drs
= 0;
1458 unsigned num_resolved_not_ok_drs
= 0;
1460 if (dump_info_p
&& dump_file
&& (dump_flags
& TDF_DETAILS
))
1461 fprintf (dump_file
, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n");
1463 for (i
= 0; datarefs
.iterate (i
, &dr
); ++i
)
1465 vec
<tree
> *stride
= DR_ACCESS_STRIDE (dr
);
1466 tree iloop_stride
= (*stride
)[i_idx
], oloop_stride
= (*stride
)[o_idx
];
1468 bool subloop_stride_p
= false;
1469 /* Data ref can't be invariant or sequential access at current loop if
1470 its address changes with respect to any subloops. */
1471 for (j
= i_idx
+ 1; j
< stride
->length (); ++j
)
1472 if (!integer_zerop ((*stride
)[j
]))
1474 subloop_stride_p
= true;
/* A zero stride means DR is invariant with respect to that loop.  */
1478 if (integer_zerop (iloop_stride
))
1480 if (!subloop_stride_p
)
1483 if (integer_zerop (oloop_stride
))
1485 if (!subloop_stride_p
)
/* Constant strides are accumulated; variable strides are classified by
   whether one divides the other, which still lets us compare them.  */
1489 if (TREE_CODE (iloop_stride
) == INTEGER_CST
1490 && TREE_CODE (oloop_stride
) == INTEGER_CST
)
1492 iloop_strides
= wi::add (iloop_strides
, wi::to_widest (iloop_stride
));
1493 oloop_strides
= wi::add (oloop_strides
, wi::to_widest (oloop_stride
));
1495 else if (multiple_of_p (TREE_TYPE (iloop_stride
),
1496 iloop_stride
, oloop_stride
))
1497 num_resolved_ok_drs
++;
1498 else if (multiple_of_p (TREE_TYPE (iloop_stride
),
1499 oloop_stride
, iloop_stride
))
1500 num_resolved_not_ok_drs
++;
1502 num_unresolved_drs
++;
1504 /* Data ref can't be sequential access if its address changes in sub
loops. */
1506 if (subloop_stride_p
)
1508 all_seq_dr_before_p
= false;
1509 all_seq_dr_after_p
= false;
1512 /* Track if all data references are sequential accesses before/after loop
1513 interchange. Note invariant is considered sequential here. */
1514 tree access_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
1515 if (all_seq_dr_before_p
1516 && ! (integer_zerop (iloop_stride
)
1517 || operand_equal_p (access_size
, iloop_stride
, 0)))
1518 all_seq_dr_before_p
= false;
1519 if (all_seq_dr_after_p
1520 && ! (integer_zerop (oloop_stride
)
1521 || operand_equal_p (access_size
, oloop_stride
, 0)))
1522 all_seq_dr_after_p
= false;
1525 if (dump_info_p
&& dump_file
&& (dump_flags
& TDF_DETAILS
))
1527 fprintf (dump_file
, "\toverall:\t\t");
1528 print_decu (iloop_strides
, dump_file
);
1529 fprintf (dump_file
, "\t");
1530 print_decu (oloop_strides
, dump_file
);
1531 fprintf (dump_file
, "\n");
1533 fprintf (dump_file
, "Invariant data ref: before(%d), after(%d)\n",
1534 num_old_inv_drs
, num_new_inv_drs
);
1535 fprintf (dump_file
, "All consecutive stride: before(%s), after(%s)\n",
1536 all_seq_dr_before_p
? "true" : "false",
1537 all_seq_dr_after_p
? "true" : "false");
1538 fprintf (dump_file
, "OK to interchage with variable strides: %d\n",
1539 num_resolved_ok_drs
);
1540 fprintf (dump_file
, "Not OK to interchage with variable strides: %d\n",
1541 num_resolved_not_ok_drs
);
1542 fprintf (dump_file
, "Variable strides we cannot decide: %d\n",
1543 num_unresolved_drs
);
/* Be conservative: refuse interchange if any variable stride could not
   be shown profitable.  */
1546 if (num_unresolved_drs
!= 0 || num_resolved_not_ok_drs
!= 0)
1549 /* We use different stride comparison ratio for interchanging innermost
1550 two loops or not. The idea is to be conservative in interchange for
1551 the innermost loops. */
1552 ratio
= innermost_loops_p
? INNER_STRIDE_RATIO
: OUTER_STRIDE_RATIO
;
1553 /* Do interchange if it gives better data locality behavior. */
1554 if (wi::gtu_p (iloop_strides
, wi::mul (oloop_strides
, ratio
)))
1556 if (wi::gtu_p (iloop_strides
, oloop_strides
))
1558 /* Or it creates more invariant memory references. */
1559 if ((!all_seq_dr_before_p
|| all_seq_dr_after_p
)
1560 && num_new_inv_drs
> num_old_inv_drs
)
1562 /* Or it makes all memory references sequential. */
1563 if (num_new_inv_drs
>= num_old_inv_drs
1564 && !all_seq_dr_before_p
&& all_seq_dr_after_p
)
1571 /* Try to interchange inner loop of a loop nest to outer level. */
1574 tree_loop_interchange::interchange (vec
<data_reference_p
> datarefs
,
1577 location_t loc
= find_loop_location (m_loop_nest
[0]);
1578 bool changed_p
= false;
1579 /* In each iteration we try to interchange I-th loop with (I+1)-th loop.
1580 The overall effect is to push inner loop to outermost level in whole
loop nest. */
1582 for (unsigned i
= m_loop_nest
.length (); i
> 1; --i
)
1584 unsigned i_idx
= i
- 1, o_idx
= i
- 2;
1586 /* Check validity for loop interchange. */
1587 if (!valid_data_dependences (i_idx
, o_idx
, ddrs
))
/* Both candidates are analyzed against the same outer loop (o_idx).  */
1590 loop_cand
iloop (m_loop_nest
[i_idx
], m_loop_nest
[o_idx
]);
1591 loop_cand
oloop (m_loop_nest
[o_idx
], m_loop_nest
[o_idx
]);
1593 /* Check if we can do transformation for loop interchange. */
1594 if (!iloop
.analyze_carried_vars (NULL
)
1595 || !iloop
.analyze_lcssa_phis ()
1596 || !oloop
.analyze_carried_vars (&iloop
)
1597 || !oloop
.analyze_lcssa_phis ()
1598 || !iloop
.can_interchange_p (NULL
)
1599 || !oloop
.can_interchange_p (&iloop
))
1602 /* Check profitability for loop interchange. */
1603 if (should_interchange_loops (i_idx
, o_idx
, datarefs
,
1604 iloop
.m_loop
->inner
== NULL
))
1606 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1608 "Loop_pair<outer:%d, inner:%d> is interchanged\n\n",
1609 oloop
.m_loop
->num
, iloop
.m_loop
->num
);
1612 interchange_loops (iloop
, oloop
);
1613 /* No need to update if there is no further loop interchange. */
1615 update_data_info (i_idx
, o_idx
, datarefs
, ddrs
);
1619 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1621 "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n",
1622 oloop
.m_loop
->num
, iloop
.m_loop
->num
);
/* Reclaim code made dead by IV remapping during the interchanges.  */
1625 simple_dce_from_worklist (m_dce_seeds
);
1628 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, loc
,
1629 "loops interchanged in loop nest\n");
1635 /* Loop interchange pass. */
/* Static metadata describing the "linterchange" GIMPLE pass.  */
1639 const pass_data pass_data_linterchange
=
1641 GIMPLE_PASS
, /* type */
1642 "linterchange", /* name */
1643 OPTGROUP_LOOP
, /* optinfo_flags */
1644 TV_LINTERCHANGE
, /* tv_id */
1645 PROP_cfg
, /* properties_required */
1646 0, /* properties_provided */
1647 0, /* properties_destroyed */
1648 0, /* todo_flags_start */
1649 0, /* todo_flags_finish */
/* Pass object for loop interchange; gated on -floop-interchange.  */
1652 class pass_linterchange
: public gimple_opt_pass
1655 pass_linterchange (gcc::context
*ctxt
)
1656 : gimple_opt_pass (pass_data_linterchange
, ctxt
)
1659 /* opt_pass methods: */
1660 opt_pass
* clone () { return new pass_linterchange (m_ctxt
); }
1661 virtual bool gate (function
*) { return flag_loop_interchange
; }
1662 virtual unsigned int execute (function
*);
1664 }; // class pass_linterchange
1667 /* Return true if LOOP has proper form for interchange. We check three
1668 conditions in the function:
1669 1) In general, a loop can be interchanged only if it doesn't have
1670 basic blocks other than header, exit and latch besides possible
1671 inner loop nest. This basically restricts loop interchange to
1672 below form loop nests:
1682 2) Data reference in basic block that executes in different times
1683 than loop head/exit is not allowed.
1684 3) Record the innermost outer loop that doesn't form rectangle loop
nest with LOOP, in MIN_OUTER. */
1688 proper_loop_form_for_interchange (struct loop
*loop
, struct loop
**min_outer
)
1692 /* Don't interchange if loop has unsupported information for the moment. */
1693 if (loop
->safelen
> 0
1694 || loop
->constraints
!= 0
1695 || loop
->can_be_parallel
1696 || loop
->dont_vectorize
1697 || loop
->force_vectorize
1698 || loop
->in_oacc_kernels_region
1699 || loop
->orig_loop_num
!= 0
1700 || loop
->simduid
!= NULL_TREE
)
1703 /* Don't interchange if outer loop has basic block other than header, exit
and latch. */
1705 if (loop
->inner
!= NULL
1706 && loop
->num_nodes
!= loop
->inner
->num_nodes
+ 3)
/* Interchange requires a single exit dominating the latch.  */
1709 if ((exit
= single_dom_exit (loop
)) == NULL
)
1712 /* Check control flow on loop header/exit blocks. */
1713 if (loop
->header
== exit
->src
1714 && (EDGE_COUNT (loop
->header
->preds
) != 2
1715 || EDGE_COUNT (loop
->header
->succs
) != 2))
1717 else if (loop
->header
!= exit
->src
1718 && (EDGE_COUNT (loop
->header
->preds
) != 2
1719 || !single_succ_p (loop
->header
)
1720 || unsupported_edge (single_succ_edge (loop
->header
))
1721 || EDGE_COUNT (exit
->src
->succs
) != 2
1722 || !single_pred_p (exit
->src
)
1723 || unsupported_edge (single_pred_edge (exit
->src
))))
/* The header's two predecessors must include the latch, and exactly one
   of them must come from inside the loop.  */
1726 e0
= EDGE_PRED (loop
->header
, 0);
1727 e1
= EDGE_PRED (loop
->header
, 1);
1728 if (unsupported_edge (e0
) || unsupported_edge (e1
)
1729 || (e0
->src
!= loop
->latch
&& e1
->src
!= loop
->latch
)
1730 || (e0
->src
->loop_father
== loop
&& e1
->src
->loop_father
== loop
))
1733 e0
= EDGE_SUCC (exit
->src
, 0);
1734 e1
= EDGE_SUCC (exit
->src
, 1);
1735 if (unsupported_edge (e0
) || unsupported_edge (e1
)
1736 || (e0
->dest
!= loop
->latch
&& e1
->dest
!= loop
->latch
)
1737 || (e0
->dest
->loop_father
== loop
&& e1
->dest
->loop_father
== loop
))
1740 /* Don't interchange if any reference is in basic block that doesn't
1741 dominate exit block. */
1742 basic_block
*bbs
= get_loop_body (loop
);
1743 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1745 basic_block bb
= bbs
[i
];
/* Only conditionally-executed blocks of LOOP itself need checking.  */
1747 if (bb
->loop_father
!= loop
1748 || bb
== loop
->header
|| bb
== exit
->src
1749 || dominated_by_p (CDI_DOMINATORS
, exit
->src
, bb
))
/* Any statement with a virtual use here is a memory access that would
   execute a different number of times than the loop head/exit.  */
1752 for (gimple_stmt_iterator gsi
= gsi_start_bb_nondebug (bb
);
1753 !gsi_end_p (gsi
); gsi_next_nondebug (&gsi
))
1754 if (gimple_vuse (gsi_stmt (gsi
)))
/* The latch execution count must be analyzable in the enclosing loop.  */
1762 tree niters
= number_of_latch_executions (loop
);
1763 niters
= analyze_scalar_evolution (loop_outer (loop
), niters
);
1764 if (!niters
|| chrec_contains_undetermined (niters
))
1767 /* Record the innermost outer loop that doesn't form rectangle loop nest. */
1768 for (loop_p loop2
= loop_outer (loop
);
1769 loop2
&& flow_loop_nested_p (*min_outer
, loop2
);
1770 loop2
= loop_outer (loop2
))
1772 niters
= instantiate_scev (loop_preheader_edge (loop2
),
1773 loop_outer (loop
), niters
);
1774 if (!evolution_function_is_invariant_p (niters
, loop2
->num
))
1783 /* Return true if any two adjacent loops in loop nest [INNERMOST, LOOP_NEST]
1784 should be interchanged by looking into all DATAREFS. */
1787 should_interchange_loop_nest (struct loop
*loop_nest
, struct loop
*innermost
,
1788 vec
<data_reference_p
> datarefs
)
1790 unsigned idx
= loop_depth (innermost
) - loop_depth (loop_nest
);
1791 gcc_assert (idx
> 0);
1793 /* Check if any two adjacent loops should be interchanged. */
1794 for (struct loop
*loop
= innermost
;
1795 loop
!= loop_nest
; loop
= loop_outer (loop
), idx
--)
1796 if (should_interchange_loops (idx
, idx
- 1, datarefs
,
1797 loop
== innermost
, false))
1803 /* Given loop nest LOOP_NEST and data references DATAREFS, compute data
1804 dependences for loop interchange and store it in DDRS. Note we compute
1805 dependences directly rather than call generic interface so that we can
1806 return on unknown dependence instantly. */
1809 tree_loop_interchange_compute_ddrs (vec
<loop_p
> loop_nest
,
1810 vec
<data_reference_p
> datarefs
,
1813 struct data_reference
*a
, *b
;
1814 struct loop
*innermost
= loop_nest
.last ();
/* Consider each unordered pair of DRs with at least one write.  */
1816 for (unsigned i
= 0; datarefs
.iterate (i
, &a
); ++i
)
1818 bool a_outer_p
= gimple_bb (DR_STMT (a
))->loop_father
!= innermost
;
1819 for (unsigned j
= i
+ 1; datarefs
.iterate (j
, &b
); ++j
)
1820 if (DR_IS_WRITE (a
) || DR_IS_WRITE (b
))
1822 bool b_outer_p
= gimple_bb (DR_STMT (b
))->loop_father
!= innermost
;
1823 /* Don't support multiple write references in outer loop. */
1824 if (a_outer_p
&& b_outer_p
&& DR_IS_WRITE (a
) && DR_IS_WRITE (b
))
1827 ddr_p ddr
= initialize_data_dependence_relation (a
, b
, loop_nest
);
1828 ddrs
->safe_push (ddr
);
1829 compute_affine_dependence (ddr
, loop_nest
[0]);
1831 /* Give up if ddr is unknown dependence or classic direct vector
1832 is not available. */
1833 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
1834 || (DDR_ARE_DEPENDENT (ddr
) == NULL_TREE
1835 && DDR_NUM_DIR_VECTS (ddr
) == 0))
1838 /* If either data references is in outer loop of nest, we require
1839 no dependence here because the data reference need to be moved
1840 into inner loop during interchange. */
1841 if (a_outer_p
&& b_outer_p
1842 && operand_equal_p (DR_REF (a
), DR_REF (b
), 0))
1844 if (DDR_ARE_DEPENDENT (ddr
) != chrec_known
1845 && (a_outer_p
|| b_outer_p
))
1853 /* Prune DATAREFS by removing any data reference not inside of LOOP. */
1856 prune_datarefs_not_in_loop (struct loop
*loop
, vec
<data_reference_p
> datarefs
)
1859 struct data_reference
*dr
;
/* Compact DATAREFS in place: I scans all entries, J counts the kept
   ones; pruned refs have their stride vectors released first.  */
1861 for (i
= 0, j
= 0; datarefs
.iterate (i
, &dr
); ++i
)
1863 if (flow_bb_inside_loop_p (loop
, gimple_bb (DR_STMT (dr
))))
1869 DR_ACCESS_STRIDE (dr
)->release ();
1875 datarefs
.truncate (j
);
1878 /* Find and store data references in DATAREFS for LOOP nest. If there's
1879 difficult data reference in a basic block, we shrink the loop nest to
1880 inner loop of that basic block's father loop. On success, return the
1881 outer loop of the result loop nest. */
1883 static struct loop
*
1884 prepare_data_references (struct loop
*loop
, vec
<data_reference_p
> *datarefs
)
1886 struct loop
*loop_nest
= loop
;
1887 vec
<data_reference_p
> *bb_refs
;
1888 basic_block bb
, *bbs
= get_loop_body_in_dom_order (loop
);
1890 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1893 /* Find data references for all basic blocks. Shrink loop nest on difficult
data reference. */
1895 for (unsigned i
= 0; loop_nest
&& i
< loop
->num_nodes
; ++i
)
1898 if (!flow_bb_inside_loop_p (loop_nest
, bb
))
/* Per-block DR vectors are kept in bb->aux until collected below.  */
1901 bb_refs
= new vec
<data_reference_p
> ();
1902 if (find_data_references_in_bb (loop
, bb
, bb_refs
) == chrec_dont_know
)
/* Unanalyzable reference: shrink the nest below this block's loop; if
   nothing with an inner loop remains, give up.  */
1904 loop_nest
= bb
->loop_father
->inner
;
1905 if (loop_nest
&& !loop_nest
->inner
)
1908 free_data_refs (*bb_refs
);
1911 else if (bb_refs
->is_empty ())
1917 /* Collect all data references in loop nest. */
1918 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1924 bb_refs
= (vec
<data_reference_p
> *) bb
->aux
;
1925 if (loop_nest
&& flow_bb_inside_loop_p (loop_nest
, bb
))
1926 datarefs
->safe_splice (*bb_refs
);
1928 free_data_refs (*bb_refs
);
1938 /* Given innermost LOOP, return true if perfect loop nest can be found and
1939 data dependences can be computed. If succeed, record the perfect loop
1940 nest in LOOP_NEST; record all data references in DATAREFS and record all
1941 data dependence relations in DDRS.
1943 We do support a restricted form of imperfect loop nest, i.e, loop nest
1944 with load/store in outer loop initializing/finalizing simple reduction
1945 of the innermost loop. For such outer loop reference, we require that
1946 it has no dependence with others since it will be moved to inner loop
during interchange. */
1950 prepare_perfect_loop_nest (struct loop
*loop
, vec
<loop_p
> *loop_nest
,
1951 vec
<data_reference_p
> *datarefs
, vec
<ddr_p
> *ddrs
)
1953 struct loop
*start_loop
= NULL
, *innermost
= loop
;
1954 struct loop
*outermost
= loops_for_fn (cfun
)->tree_root
;
1956 /* Find loop nest from the innermost loop. The outermost is the innermost
outer loop we can handle. */
/* Walk outwards while each loop has the previous one as its single
   inner loop, i.e. while the nest stays perfect.  */
1958 while (loop
->num
!= 0 && loop
->inner
== start_loop
1959 && flow_loop_nested_p (outermost
, loop
))
1961 if (!proper_loop_form_for_interchange (loop
, &outermost
))
1965 /* If this loop has sibling loop, the father loop won't be in perfect
loop nest. */
1967 if (loop
->next
!= NULL
)
1970 loop
= loop_outer (loop
);
/* Need at least two loops to interchange.  */
1972 if (!start_loop
|| !start_loop
->inner
)
1975 /* Prepare the data reference vector for the loop nest, pruning outer
1976 loops we cannot handle. */
1977 start_loop
= prepare_data_references (start_loop
, datarefs
)
1979 /* Check if there is no data reference. */
1980 || datarefs
->is_empty ()
1981 /* Check if there are too many of data references. */
1982 || (int) datarefs
->length () > MAX_DATAREFS
)
1985 /* Compute access strides for all data references, pruning outer
1986 loops we cannot analyze refs in. */
1987 start_loop
= compute_access_strides (start_loop
, innermost
, *datarefs
);
1991 /* Check if any interchange is profitable in the loop nest. */
1992 if (!should_interchange_loop_nest (start_loop
, innermost
, *datarefs
))
1995 /* Check if data dependences can be computed for loop nest starting from
this loop nest. */
1999 loop_nest
->truncate (0);
2001 if (loop
!= start_loop
)
2002 prune_datarefs_not_in_loop (start_loop
, *datarefs
);
2004 if (find_loop_nest (start_loop
, loop_nest
)
2005 && tree_loop_interchange_compute_ddrs (*loop_nest
, *datarefs
, ddrs
))
2007 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2009 "\nConsider loop interchange for loop_nest<%d - %d>\n",
2010 start_loop
->num
, innermost
->num
);
2012 if (loop
!= start_loop
)
2013 prune_access_strides_not_in_loop (start_loop
, innermost
, *datarefs
);
2015 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2016 dump_access_strides (*datarefs
);
/* Dependence computation failed; free what we built so far.  */
2021 free_dependence_relations (*ddrs
);
2023 /* Try to compute data dependences with the outermost loop stripped. */
2025 start_loop
= start_loop
->inner
;
2026 } while (start_loop
&& start_loop
->inner
);
2031 /* Main entry for loop interchange pass. */
2034 pass_linterchange::execute (function
*fun
)
/* number_of_loops counts the function's root "loop" as well; fewer than
   two real loops means nothing can be interchanged.  */
2036 if (number_of_loops (fun
) <= 2)
2039 bool changed_p
= false;
/* Attempt interchange for every innermost loop's enclosing nest.  */
2041 FOR_EACH_LOOP (loop
, LI_ONLY_INNERMOST
)
2043 vec
<loop_p
> loop_nest
= vNULL
;
2044 vec
<data_reference_p
> datarefs
= vNULL
;
2045 vec
<ddr_p
> ddrs
= vNULL
;
2046 if (prepare_perfect_loop_nest (loop
, &loop_nest
, &datarefs
, &ddrs
))
2048 tree_loop_interchange
loop_interchange (loop_nest
);
2049 changed_p
|= loop_interchange
.interchange (datarefs
, ddrs
);
2051 free_dependence_relations (ddrs
);
2052 free_data_refs_with_aux (datarefs
);
2053 loop_nest
.release ();
/* Virtual operands were dropped when moving stmts; request rebuild.  */
2059 return changed_p
? (TODO_update_ssa_only_virtuals
) : 0;
/* Pass instantiation hook: create the linterchange pass for CTXT.  */
2065 make_pass_linterchange (gcc::context
*ctxt
)
2067 return new pass_linterchange (ctxt
);