/* Loop unrolling and peeling.
   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */
#include "coretypes.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "cfglayout.h"
/* This pass performs loop unrolling and peeling.  We only perform these
   optimizations on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from greater sequentiality
   of the code, from better code for the following passes to optimize, and
   in some cases from fewer tests of the exit condition.  The main problem
   is code growth, which impacts performance negatively through its effect
   on caches.

   What we do:

   -- complete peeling of once-rolling loops; this is the above-mentioned
      exception, as it causes the loop to be cancelled completely and
      does not cause code growth
   -- complete peeling of loops that roll a (small) constant number of times
   -- simple peeling of the first iterations of loops that do not roll much
      (according to profile feedback)
   -- unrolling of loops that roll a constant number of times; this is almost
      always a win, as we get rid of the exit condition tests
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of the exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of the remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code

   For more detailed descriptions of each of those, see the comments at the
   appropriate functions below.

   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll/peel.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll a loop; the experiments I have made
   show that this choice may affect performance on the order of several %.  */
/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx insn;             /* The insn in which the induction variable occurs.  */
  rtx base_var;         /* The variable on which the values in the further
                           iterations are based.  */
  rtx step;             /* Step of the induction variable.  */
  unsigned n_loc;
  unsigned loc[3];      /* Location where the definition of the induction
                           variable occurs in the insn.  For example if
                           N_LOC is 2, the expression is located at
                           XEXP (XEXP (single_set, loc[0]), loc[1]).  */
};

struct split_ivs_info
{
  htab_t insns_to_split;        /* A hashtable of insns to split.  */
  unsigned first_new_block;     /* The first basic block that was
                                   duplicated.  */
};
static void decide_unrolling_and_peeling (struct loops *, int);
static void peel_loops_completely (struct loops *, int);
static void decide_peel_simple (struct loop *, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void peel_loop_simple (struct loops *, struct loop *);
static void peel_loop_completely (struct loops *, struct loop *);
static void unroll_loop_stupid (struct loops *, struct loop *);
static void unroll_loop_constant_iterations (struct loops *, struct loop *);
static void unroll_loop_runtime_iterations (struct loops *, struct loop *);
static struct split_ivs_info *analyze_ivs_to_split (struct loop *);
static void si_info_start_duplication (struct split_ivs_info *);
static void split_ivs_in_copies (struct split_ivs_info *, unsigned, bool, bool);
static void free_si_info (struct split_ivs_info *);
/* Unroll and/or peel (depending on FLAGS) LOOPS.  */
void
unroll_and_peel_loops (struct loops *loops, int flags)
{
  struct loop *loop, *next;

  /* First perform complete loop peeling (it is almost surely a win,
     and affects parameters for further decisions a lot).  */
  peel_loops_completely (loops, flags);

  /* Now decide the rest of the unrolling and peeling.  */
  decide_unrolling_and_peeling (loops, flags);

  loop = loops->tree_root;

  /* Scan the loops, inner ones first.  */
  while (loop != loops->tree_root)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
        {
        case LPT_PEEL_COMPLETELY:

        case LPT_PEEL_SIMPLE:
          peel_loop_simple (loops, loop);
          break;
        case LPT_UNROLL_CONSTANT:
          unroll_loop_constant_iterations (loops, loop);
          break;
        case LPT_UNROLL_RUNTIME:
          unroll_loop_runtime_iterations (loops, loop);
          break;
        case LPT_UNROLL_STUPID:
          unroll_loop_stupid (loops, loop);
          break;
        }

#ifdef ENABLE_CHECKING
      verify_dominators (CDI_DOMINATORS);
      verify_loop_structure (loops);
#endif
/* Check whether the exit of the LOOP is at the end of the loop body.  */
static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
/* Check whether to peel LOOPS (depending on FLAGS) completely and do so.  */
static void
peel_loops_completely (struct loops *loops, int flags)
{
  struct loop *loop, *next;

  loop = loops->tree_root;

  while (loop != loops->tree_root)
    {
      loop->lpt_decision.decision = LPT_NONE;

      if (dump_file)
        fprintf (dump_file,
                 "\n;; *** Considering loop %d for complete peeling ***\n",
                 loop->num);

      loop->ninsns = num_loop_insns (loop);

      decide_peel_once_rolling (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_peel_completely (loop, flags);

      if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
        {
          peel_loop_completely (loops, loop);
#ifdef ENABLE_CHECKING
          verify_dominators (CDI_DOMINATORS);
          verify_loop_structure (loops);
#endif
        }
/* Decide whether to unroll or peel LOOPS (depending on FLAGS) and how much.  */
static void
decide_unrolling_and_peeling (struct loops *loops, int flags)
{
  struct loop *loop = loops->tree_root, *next;

  /* Scan the loops, inner ones first.  */
  while (loop != loops->tree_root)
    {
      loop->lpt_decision.decision = LPT_NONE;

      fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);

      /* Do not peel cold areas.  */
      if (!maybe_hot_bb_p (loop->header))
        fprintf (dump_file, ";; Not considering loop, cold area\n");

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
        fprintf (dump_file,
                 ";; Not considering loop, cannot duplicate\n");

      /* Skip non-innermost loops.  */
      fprintf (dump_file, ";; Not considering loop, is not innermost\n");

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
         priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_unroll_stupid (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_peel_simple (loop, flags);
/* Decide whether the LOOP is once rolling and suitable for complete
   peeling.  */
static void
decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  struct niter_desc *desc;

  fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");

  /* Is the loop small enough?  */
  if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
    fprintf (dump_file, ";; Not considering loop, is too big\n");

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check the number of iterations.  */
  fprintf (dump_file,
           ";; Unable to prove that the loop rolls exactly once\n");

  fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
/* Decide whether the LOOP is suitable for complete peeling.  */
static void
decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  struct niter_desc *desc;

  fprintf (dump_file, "\n;; Considering peeling completely\n");

  /* Skip non-innermost loops.  */
  fprintf (dump_file, ";; Not considering loop, is not innermost\n");

  /* Do not peel cold areas.  */
  if (!maybe_hot_bb_p (loop->header))
    fprintf (dump_file, ";; Not considering loop, cold area\n");

  /* Can the loop be manipulated?  */
  if (!can_duplicate_loop_p (loop))
    fprintf (dump_file,
             ";; Not considering loop, cannot duplicate\n");

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);

  /* Is the loop small enough?  */
  fprintf (dump_file, ";; Not considering loop, is too big\n");

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check the number of iterations.  */
  if (!desc->simple_p
      || !desc->const_iter)
    fprintf (dump_file,
             ";; Unable to prove that the loop iterates constant times\n");

  if (desc->niter > npeel - 1)
    {
      fprintf (dump_file,
               ";; Not peeling loop completely, rolls too much (");
      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
      fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
    }

  fprintf (dump_file, ";; Decided to peel loop completely\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
/* Peel all iterations of LOOP, remove exit edges and cancel the loop
   completely.  The transformation done:

   for (i = 0; i < 4; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   body; i++;
   body; i++;

   */
static void
peel_loop_completely (struct loops *loops, struct loop *loop)
{
  unsigned HOST_WIDE_INT npeel;
  unsigned n_remove_edges, i;
  edge *remove_edges, ei;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct split_ivs_info *si_info = NULL;

  wont_exit = sbitmap_alloc (npeel + 1);
  sbitmap_ones (wont_exit);
  RESET_BIT (wont_exit, 0);
  if (desc->noloop_assumptions)
    RESET_BIT (wont_exit, 1);

  remove_edges = xcalloc (npeel, sizeof (edge));

  if (flag_split_ivs_in_unroller)
    si_info = analyze_ivs_to_split (loop);

  si_info_start_duplication (si_info);
  if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      wont_exit, desc->out_edge,
                                      remove_edges, &n_remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ))

  split_ivs_in_copies (si_info, npeel, false, true);
  free_si_info (si_info);

  /* Remove the exit edges.  */
  for (i = 0; i < n_remove_edges; i++)
    remove_path (loops, remove_edges[i]);

  free_simple_loop_desc (loop);

  /* Now remove the unreachable part of the last iteration and cancel
     the loop.  */
  remove_path (loops, ei);

  fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */
static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  fprintf (dump_file,
           "\n;; Considering unrolling loop with constant "
           "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
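
  /* Purely as an illustration (the numbers are invented, not taken from
     params.def): with PARAM_MAX_UNROLLED_INSNS == 200 and loop->ninsns == 25
     the division above would give nunroll == 8, which the two caps above may
     then lower further.  */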

  /* Skip big loops.  */
  fprintf (dump_file, ";; Not considering loop, is too big\n");

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check the number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    fprintf (dump_file,
             ";; Unable to prove that the loop iterates constant times\n");

  /* Check whether the loop rolls enough to consider.  */
  if (desc->niter < 2 * nunroll)
    fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
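  /* For illustration only (numbers invented for the example): with
     desc->niter == 100 and the exit not at the end of the body, a factor
     of 4 (i == 3) costs 100 % 4 + 3 + 1 == 4 copies of the body, while a
     factor of 8 (i == 7) costs 100 % 8 + 7 + 1 == 12, so the search below
     would prefer the smaller factor in that situation.  */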
  best_copies = 2 * nunroll + 10;

  if (i - 1 >= desc->niter)

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
        n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
               || desc->noloop_assumptions != NULL_RTX)
        n_copies = exit_mod + i + 2;

      if (n_copies < best_copies)
        {
          best_copies = n_copies;
          best_unroll = i;
        }
    }

  fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
           best_unroll + 1, best_copies, nunroll);

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;

  fprintf (dump_file,
           ";; Decided to unroll the constant times rolling loop, %d times.\n",
           loop->lpt_decision.times);
/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
   times.  The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_constant_iterations (struct loops *loops, struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned n_remove_edges, i;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct split_ivs_info *si_info = NULL;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  sbitmap_ones (wont_exit);

  remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge));

  if (flag_split_ivs_in_unroller)
    si_info = analyze_ivs_to_split (loop);

  /* The exit is not at the end of the loop; leave the exit test
     in the first copy, so that the loops that start with a test
     of the exit condition have a continuous body after unrolling.  */

  fprintf (dump_file, ";; Condition on beginning of loop.\n");

  /* Peel exit_mod iterations.  */
  RESET_BIT (wont_exit, 0);
  if (desc->noloop_assumptions)
    RESET_BIT (wont_exit, 1);

  si_info_start_duplication (si_info);
  if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      wont_exit, desc->out_edge,
                                      remove_edges, &n_remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ))

  if (si_info && exit_mod > 1)
    split_ivs_in_copies (si_info, exit_mod, false, false);

  desc->noloop_assumptions = NULL_RTX;
  desc->niter -= exit_mod;
  desc->niter_max -= exit_mod;

  SET_BIT (wont_exit, 1);

  /* Leave the exit test in the last copy, for the same reason as above, if
     the loop tests the condition at the end of the loop body.  */

  fprintf (dump_file, ";; Condition on end of loop.\n");

  /* We know that niter >= max_unroll + 2; so we do not need to care about
     the case when we would exit before reaching the loop.  So just peel
     exit_mod + 1 iterations.  */
  if (exit_mod != max_unroll
      || desc->noloop_assumptions)
    {
      RESET_BIT (wont_exit, 0);
      if (desc->noloop_assumptions)
        RESET_BIT (wont_exit, 1);

      si_info_start_duplication (si_info);
      if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          wont_exit, desc->out_edge,
                                          remove_edges, &n_remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ))

      if (si_info && exit_mod > 0)
        split_ivs_in_copies (si_info, exit_mod + 1, false, false);

      desc->niter -= exit_mod + 1;
      desc->niter_max -= exit_mod + 1;
      desc->noloop_assumptions = NULL_RTX;

      SET_BIT (wont_exit, 0);
      SET_BIT (wont_exit, 1);
    }

  RESET_BIT (wont_exit, max_unroll);

  /* Now unroll the loop.  */
  si_info_start_duplication (si_info);
  if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      wont_exit, desc->out_edge,
                                      remove_edges, &n_remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ))

  split_ivs_in_copies (si_info, max_unroll, true, true);
  free_si_info (si_info);

  basic_block exit_block = desc->in_edge->src->rbi->copy;
  /* Find the new in and out edges; they are in the last copy we have made.  */
  if (exit_block->succ->dest == desc->out_edge->dest)
    {
      desc->out_edge = exit_block->succ;
      desc->in_edge = exit_block->succ->succ_next;
    }
  else
    {
      desc->out_edge = exit_block->succ->succ_next;
      desc->in_edge = exit_block->succ;
    }

  desc->niter /= max_unroll + 1;
  desc->niter_max /= max_unroll + 1;
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  for (i = 0; i < n_remove_edges; i++)
    remove_path (loops, remove_edges[i]);

  fprintf (dump_file,
           ";; Unrolled loop %d times, constant # of iterations %i insns\n",
           max_unroll, num_loop_insns (loop));
/* Decide whether to unroll LOOP iterating a runtime-computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  fprintf (dump_file,
           "\n;; Considering unrolling loop with runtime "
           "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  fprintf (dump_file, ";; Not considering loop, is too big\n");

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    fprintf (dump_file,
             ";; Unable to prove that the number of iterations "
             "can be counted in runtime\n");

  if (desc->const_iter)
    fprintf (dump_file, ";; Loop iterates constant times\n");

  /* If we have profile feedback, check whether the loop rolls.  */
  if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
    fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in the computation of the number of iterations.  */
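  /* E.g. a computed nunroll of 6 would leave the loop below with i == 4,
     i.e. an unroll factor of 4 (so lpt_decision.times is set to 3 below);
     the 6 is only an illustrative value, not anything this pass
     hard-codes.  */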
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;

  fprintf (dump_file,
           ";; Decided to unroll the runtime computable "
           "times rolling loop, %d times.\n",
           loop->lpt_decision.times);
/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   The loop is preconditioned by a sequence of jumps that peels
   n % (TIMES + 1) iterations, followed by a copy of the loop whose body is
   unrolled TIMES + 1 times.  */
static void
unroll_loop_runtime_iterations (struct loops *loops, struct loop *loop)
{
  rtx old_niter, niter, init_code, branch_code, tmp;
  basic_block preheader, *body, *dom_bbs, swtch, ezc_swtch;
  unsigned n_peel, n_remove_edges;
  edge *remove_edges, e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct split_ivs_info *si_info = NULL;

  if (flag_split_ivs_in_unroller)
    si_info = analyze_ivs_to_split (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block));

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      nldom = get_dominated_by (CDI_DOMINATORS, body[i], &ldom);
      for (j = 0; j < nldom; j++)
        if (!flow_bb_inside_loop_p (loop, ldom[j]))
          dom_bbs[n_dom_bbs++] = ldom[j];
    }

  if (!exit_at_end)
    {
      /* Leave the exit in the first copy (for an explanation why, see the
         comment in unroll_loop_constant_iterations).  */
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave the exit in the last copy (for an explanation why, see the
         comment in unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get the expression for the number of iterations.  */
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  emit_move_insn (niter, tmp);

  /* Count the modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
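  /* Illustration: with an unroll factor of 4, max_unroll is 3 and
     niter & 3 equals niter % 4 for any bit pattern in niter, so the result
     stays usable even when the expression computing niter wrapped around.  */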
  niter = expand_simple_binop (desc->mode, AND,
                               niter,
                               GEN_INT (max_unroll),
                               NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = get_insns ();

  /* Precondition the loop.  */
  loop_split_edge_with (loop_preheader_edge (loop), init_code);

  remove_edges = xcalloc (max_unroll + n_peel + 1, sizeof (edge));

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of the loop body (almost always we must leave the
     exit test here; the only exception is when we have an extra zero check
     and the number of iterations is reliable).  Also record the place of
     the (possible) extra zero check.  */
  sbitmap_zero (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    SET_BIT (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      wont_exit, desc->out_edge,
                                      remove_edges, &n_remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ))

  /* Record the place where the switch will be built for preconditioning.  */
  swtch = loop_split_edge_with (loop_preheader_edge (loop),
                                NULL_RTX);

  for (i = 0; i < n_peel; i++)
    {
      sbitmap_zero (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
        SET_BIT (wont_exit, 1);
      if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          wont_exit, desc->out_edge,
                                          remove_edges, &n_remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ))

      /* Create item for switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX);
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
                                          block_label (preheader), p, NULL_RTX);

      swtch = loop_split_edge_with (swtch->pred, branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      swtch->succ->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);

      preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX);
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
                                          block_label (preheader), p, NULL_RTX);

      swtch = loop_split_edge_with (swtch->succ, branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      swtch->succ->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, n_dom_bbs);

  /* And unroll the loop.  */
  sbitmap_ones (wont_exit);
  RESET_BIT (wont_exit, may_exit_copy);

  si_info_start_duplication (si_info);
  if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      wont_exit, desc->out_edge,
                                      remove_edges, &n_remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ))

  split_ivs_in_copies (si_info, max_unroll, true, true);
  free_si_info (si_info);

  basic_block exit_block = desc->in_edge->src->rbi->copy;
  /* Find the new in and out edges; they are in the last copy we have made.  */
  if (exit_block->succ->dest == desc->out_edge->dest)
    {
      desc->out_edge = exit_block->succ;
      desc->in_edge = exit_block->succ->succ_next;
    }
  else
    {
      desc->out_edge = exit_block->succ->succ_next;
      desc->in_edge = exit_block->succ;
    }

  /* Remove the edges.  */
  for (i = 0; i < n_remove_edges; i++)
    remove_path (loops, remove_edges[i]);
  free (remove_edges);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter, GEN_INT (max_unroll + 1));
  desc->niter_max /= max_unroll + 1;

  desc->niter_expr =
    simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
  desc->noloop_assumptions = NULL_RTX;

  fprintf (dump_file,
           ";; Unrolled loop %d times, counting # of iterations "
           "in runtime, %i insns\n",
           max_unroll, num_loop_insns (loop));
/* Decide whether to simply peel LOOP and how much.  */
static void
decide_peel_simple (struct loop *loop, int flags)
{
  struct niter_desc *desc;

  if (!(flags & UAP_PEEL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  fprintf (dump_file, "\n;; Considering simply peeling loop\n");

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);

  /* Skip big loops.  */
  fprintf (dump_file, ";; Not considering loop, is too big\n");

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check the number of iterations.  */
  if (desc->simple_p && !desc->assumptions && desc->const_iter)
    fprintf (dump_file, ";; Loop iterates constant times\n");

  /* Do not simply peel loops with branches inside -- it increases the
     number of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    fprintf (dump_file, ";; Not peeling, contains branches\n");

  if (loop->header->count)
    {
      unsigned niter = expected_loop_iterations (loop);
      if (niter + 1 > npeel)
        {
          fprintf (dump_file, ";; Not peeling loop, rolls too much (");
          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                   (HOST_WIDEST_INT) (niter + 1));
          fprintf (dump_file, " iterations > %d [maximum peelings])\n",
                   npeel);
        }
    }

  /* For now we have no good heuristics to decide whether loop peeling
     will be effective, so disable it.  */
  fprintf (dump_file,
           ";; Not peeling loop, no evidence it will be profitable\n");

  loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
  loop->lpt_decision.times = npeel;

  fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
           loop->lpt_decision.times);
/* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:

   while (cond)
     body;

   ==>

   if (!cond) goto end;
   body;
   if (!cond) goto end;
   body;
   while (cond)
     body;
   end: ;
   */
static void
peel_loop_simple (struct loops *loops, struct loop *loop)
{
  unsigned npeel = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct split_ivs_info *si_info = NULL;

  if (flag_split_ivs_in_unroller && npeel > 1)
    si_info = analyze_ivs_to_split (loop);

  wont_exit = sbitmap_alloc (npeel + 1);
  sbitmap_zero (wont_exit);

  si_info_start_duplication (si_info);
  if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      loops, npeel, wont_exit, NULL, NULL, NULL,
                                      DLTHE_FLAG_UPDATE_FREQ))

  split_ivs_in_copies (si_info, npeel, false, false);
  free_si_info (si_info);

  if (desc->const_iter)
    {
      desc->niter -= npeel;
      desc->niter_expr = GEN_INT (desc->niter);
      desc->noloop_assumptions = NULL_RTX;
    }
  else
    {
      /* We cannot just update niter_expr, as its value might be clobbered
         inside the loop.  We could handle this by counting the number into a
         temporary just like we do in runtime unrolling, but it does not
         seem worthwhile.  */
      free_simple_loop_desc (loop);
    }

  fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  fprintf (dump_file, ";; Not considering loop, is too big\n");

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    fprintf (dump_file, ";; The loop is simple\n");

  /* Do not unroll loops with branches inside -- it increases the
     number of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    fprintf (dump_file, ";; Not unrolling, contains branches\n");

  /* If we have profile feedback, check whether the loop rolls.  */
  if (loop->header->count
      && expected_loop_iterations (loop) < 2 * nunroll)
    fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;

  fprintf (dump_file,
           ";; Decided to unroll the loop stupidly, %d times.\n",
           loop->lpt_decision.times);
/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:

   while (cond)
     body;

   ==>

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loops *loops, struct loop *loop)
{
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct split_ivs_info *si_info = NULL;

  if (flag_split_ivs_in_unroller)
    si_info = analyze_ivs_to_split (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  sbitmap_zero (wont_exit);

  si_info_start_duplication (si_info);
  if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      loops, nunroll, wont_exit, NULL, NULL, NULL,
                                      DLTHE_FLAG_UPDATE_FREQ))

  split_ivs_in_copies (si_info, nunroll, true, true);
  free_si_info (si_info);

  /* We indeed may get here provided that there are nontrivial assumptions
     for a loop to be really simple.  We could update the counts, but the
     problem is that we are unable to decide which exit will be taken
     (not really true in case the number of iterations is constant,
     but no one will do anything with this information, so we do not
     worry about it).  */
  desc->simple_p = false;

  fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
           nunroll, num_loop_insns (loop));
/* A hash function for information about insns to split.  */
static hashval_t
si_info_hash (const void *ivts)
{
  return htab_hash_pointer (((struct iv_to_split *) ivts)->insn);
}

/* An equality function for information about insns to split.  */
static int
si_info_eq (const void *ivts1, const void *ivts2)
{
  const struct iv_to_split *i1 = ivts1;
  const struct iv_to_split *i2 = ivts2;

  return i1->insn == i2->insn;
}
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.  Return NULL if INSN
   contains no interesting IVs.  Otherwise, allocate an IV_TO_SPLIT
   structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx insn)
{
  struct iv_to_split *ivts;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);

  dest = SET_DEST (set);

  if (!biv_p (insn, dest))

  if (!iv_analyze (insn, dest, &iv))

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)

  /* Record the insn to split.  */
  ivts = xmalloc (sizeof (struct iv_to_split));
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
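
  /* At this point IVTS describes, for example, an increment like
     r60 = r60 + 4 found in the loop body: step 4, with the base variable
     still to be allocated.  (The register number and step are made up
     purely for illustration.)  */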
/* Determines which of the induction variables in LOOP to split.
   Return a SPLIT_IVS_INFO struct with the hash table filled
   with all insns to split IVs in.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct split_ivs_info *
analyze_ivs_to_split (struct loop *loop)
{
  basic_block *body, bb;
  struct split_ivs_info *si_info = xcalloc (1, sizeof (struct split_ivs_info));
  struct iv_to_split *ivts;

  si_info->insns_to_split = htab_create (5 * loop->num_nodes,
                                         si_info_hash, si_info_eq, free);

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))

      FOR_BB_INSNS (bb, insn)
        {
          ivts = analyze_iv_to_split_insn (insn);

          slot = htab_find_slot (si_info->insns_to_split, ivts, INSERT);
/* Called just before loop duplication.  Records the start of the duplicated
   area in SI_INFO.  */

static void
si_info_start_duplication (struct split_ivs_info *si_info)
{
  si_info->first_new_block = last_basic_block;
}
/* Determine the number of iterations between the initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* If we are unrolling, initialization is done in the original loop
     body (number 0).  */

  /* If we are peeling, the copy in which the initialization occurs has
     number 1.  The original loop (number 0) is the last.  */
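  /* Illustration (with, say, four new copies): when unrolling, copy number
     N_COPY runs N_COPY iterations after the initialization, so the delta is
     simply N_COPY; when peeling, copy 1 is the first peeled iteration
     (delta 0) and the original body, copy 0, runs after all the peeled
     copies (delta N_COPIES).  */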
/* Locate in EXPR the expression corresponding to the location recorded
   in IVTS, and return a pointer to the RTX for this location.  */

static rtx *
get_ivts_expr (rtx expr, struct iv_to_split *ivts)
{
  for (i = 0; i < ivts->n_loc; i++)
    ret = &XEXP (*ret, ivts->loc[i]);
/* Allocate a basic variable for the induction variable chain.  Callback for
   htab_traverse.  */

static int
allocate_basic_variable (void **slot, void *data ATTRIBUTE_UNUSED)
{
  struct iv_to_split *ivts = *slot;
  rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
/* Insert the initialization of the basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx insn)
{
  rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));

  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);

  emit_insn_before (seq, insn);
}
/* Replace the use of the induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */
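/* For example (values invented for the illustration): if the insn sits in a
   copy that runs DELTA == 2 iterations after the initialization of the base
   variable i0, and the step is 1, a use of the original induction variable i
   is rewritten to i0 + 2.  */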
static void
split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
{
  rtx expr, *loc, seq, incr, var;
  enum machine_mode mode = GET_MODE (ivts->base_var);

  /* Construct base + DELTA * step.  */
  expr = ivts->base_var;

  incr = simplify_gen_binary (MULT, mode,
                              ivts->step, gen_int_mode (delta, mode));
  expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
                              ivts->base_var, incr);

  /* Figure out where to do the replacement.  */
  loc = get_ivts_expr (single_set (insn), ivts);

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))

  /* Otherwise, force EXPR into a register and try again.  */
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  emit_move_insn (var, expr);
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))

  /* The last chance.  Try recreating the assignment in INSN
     completely from scratch.  */
  set = single_set (insn);

  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);

  emit_move_insn (dest, src);

  emit_insn_before (seq, insn);
}
/* Splits the induction variables (that are marked in SI_INFO) in copies of
   the loop.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body
   of the loop (as should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */
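/* As an illustration of the splitting: if every copy of the body originally
   contained i = i + 1, the rewrite initializes a base variable i0 once
   outside the copies and makes copy number k compute its value from i0 + k,
   so the copies no longer depend on one another.  */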
static void
split_ivs_in_copies (struct split_ivs_info *si_info, unsigned n_copies,
                     bool unrolling, bool rewrite_original_loop)
{
  basic_block bb, orig_bb;
  rtx insn, orig_insn, next;
  struct iv_to_split ivts_templ, *ivts;

  /* Sanity check -- we need to put the initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  htab_traverse (si_info->insns_to_split, allocate_basic_variable, NULL);

  for (i = si_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = bb->rbi->original;

      delta = determine_split_iv_delta (bb->rbi->copy_number, n_copies,
                                        unrolling);
      orig_insn = BB_HEAD (orig_bb);
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
        {
          next = NEXT_INSN (insn);

          while (!INSN_P (orig_insn))
            orig_insn = NEXT_INSN (orig_insn);

          ivts_templ.insn = orig_insn;
          ivts = htab_find (si_info->insns_to_split, &ivts_templ);

#ifdef ENABLE_CHECKING
          if (!rtx_equal_p (PATTERN (insn), PATTERN (orig_insn)))

          insert_base_initialization (ivts, insn);
          split_iv (ivts, insn, delta);

          orig_insn = NEXT_INSN (orig_insn);
        }
    }

  if (!rewrite_original_loop)
    return;

  /* Rewrite also the original loop body.  Find the original blocks as the
     originals of the blocks in the last copied iteration, i.e. those that
     have bb->rbi->original->copy == bb.  */
  for (i = si_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = bb->rbi->original;
      if (orig_bb->rbi->copy != bb)
        continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
           orig_insn != NEXT_INSN (BB_END (bb));
           orig_insn = next)
        {
          next = NEXT_INSN (orig_insn);

          if (!INSN_P (orig_insn))
            continue;

          ivts_templ.insn = orig_insn;
          ivts = htab_find (si_info->insns_to_split, &ivts_templ);

          insert_base_initialization (ivts, orig_insn);
          split_iv (ivts, orig_insn, delta);
        }
    }
1757 free_si_info (struct split_ivs_info
*si_info
)
1759 htab_delete (si_info
->insns_to_split
);