1 /* Loop unrolling and peeling.
2 Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008
3 Free Software Foundation, Inc.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
26 #include "hard-reg-set.h"
28 #include "basic-block.h"
30 #include "cfglayout.h"
38 /* This pass performs loop unrolling and peeling. We only perform these
39 optimizations on innermost loops (with single exception) because
40 the impact on performance is greatest here, and we want to avoid
41 unnecessary code size growth. The gain is caused by greater sequentiality
42 of code, better code to optimize for further passes and in some cases
43 by fewer testings of exit conditions. The main problem is code growth,
44 that impacts performance negatively due to effect of caches.
48 -- complete peeling of once-rolling loops; this is the above mentioned
49 exception, as this causes loop to be cancelled completely and
50 does not cause code growth
51 -- complete peeling of loops that roll (small) constant times.
52 -- simple peeling of first iterations of loops that do not roll much
53 (according to profile feedback)
54 -- unrolling of loops that roll constant times; this is almost always
55 win, as we get rid of exit condition tests.
56 -- unrolling of loops that roll number of times that we can compute
57 in runtime; we also get rid of exit condition tests here, but there
58 is the extra expense for calculating the number of iterations
59 -- simple unrolling of remaining loops; this is performed only if we
60 are asked to, as the gain is questionable in this case and often
61 it may even slow down the code
62 For more detailed descriptions of each of those, see comments at
63 appropriate function below.
65 There is a lot of parameters (defined and described in params.def) that
66 control how much we unroll/peel.
68 ??? A great problem is that we don't have a good way how to determine
69 how many times we should unroll the loop; the experiments I have made
70 showed that this choice may affect performance in order of several %.
73 /* Information about induction variables to split. */
77 rtx insn
; /* The insn in that the induction variable occurs. */
78 rtx base_var
; /* The variable on that the values in the further
79 iterations are based. */
80 rtx step
; /* Step of the induction variable. */
81 struct iv_to_split
*next
; /* Next entry in walking order. */
83 unsigned loc
[3]; /* Location where the definition of the induction
84 variable occurs in the insn. For example if
85 N_LOC is 2, the expression is located at
86 XEXP (XEXP (single_set, loc[0]), loc[1]). */
89 /* Information about accumulators to expand. */
93 rtx insn
; /* The insn in that the variable expansion occurs. */
94 rtx reg
; /* The accumulator which is expanded. */
95 VEC(rtx
,heap
) *var_expansions
; /* The copies of the accumulator which is expanded. */
96 struct var_to_expand
*next
; /* Next entry in walking order. */
97 enum rtx_code op
; /* The type of the accumulation - addition, subtraction
99 int expansion_count
; /* Count the number of expansions generated so far. */
100 int reuse_expansion
; /* The expansion we intend to reuse to expand
101 the accumulator. If REUSE_EXPANSION is 0 reuse
102 the original accumulator. Else use
103 var_expansions[REUSE_EXPANSION - 1]. */
104 unsigned accum_pos
; /* The position in which the accumulator is placed in
105 the insn src. For example in x = x + something
106 accum_pos is 0 while in x = something + x accum_pos
110 /* Information about optimization applied in
111 the unrolled loop. */
115 htab_t insns_to_split
; /* A hashtable of insns to split. */
116 struct iv_to_split
*iv_to_split_head
; /* The first iv to split. */
117 struct iv_to_split
**iv_to_split_tail
; /* Pointer to the tail of the list. */
118 htab_t insns_with_var_to_expand
; /* A hashtable of insns with accumulators
120 struct var_to_expand
*var_to_expand_head
; /* The first var to expand. */
121 struct var_to_expand
**var_to_expand_tail
; /* Pointer to the tail of the list. */
122 unsigned first_new_block
; /* The first basic block that was
124 basic_block loop_exit
; /* The loop exit basic block. */
125 basic_block loop_preheader
; /* The loop preheader basic block. */
128 static void decide_unrolling_and_peeling (int);
129 static void peel_loops_completely (int);
130 static void decide_peel_simple (struct loop
*, int);
131 static void decide_peel_once_rolling (struct loop
*, int);
132 static void decide_peel_completely (struct loop
*, int);
133 static void decide_unroll_stupid (struct loop
*, int);
134 static void decide_unroll_constant_iterations (struct loop
*, int);
135 static void decide_unroll_runtime_iterations (struct loop
*, int);
136 static void peel_loop_simple (struct loop
*);
137 static void peel_loop_completely (struct loop
*);
138 static void unroll_loop_stupid (struct loop
*);
139 static void unroll_loop_constant_iterations (struct loop
*);
140 static void unroll_loop_runtime_iterations (struct loop
*);
141 static struct opt_info
*analyze_insns_in_loop (struct loop
*);
142 static void opt_info_start_duplication (struct opt_info
*);
143 static void apply_opt_in_copies (struct opt_info
*, unsigned, bool, bool);
144 static void free_opt_info (struct opt_info
*);
145 static struct var_to_expand
*analyze_insn_to_expand_var (struct loop
*, rtx
);
146 static bool referenced_in_one_insn_in_loop_p (struct loop
*, rtx
);
147 static struct iv_to_split
*analyze_iv_to_split_insn (rtx
);
148 static void expand_var_during_unrolling (struct var_to_expand
*, rtx
);
149 static void insert_var_expansion_initialization (struct var_to_expand
*,
151 static void combine_var_copies_in_loop_exit (struct var_to_expand
*,
153 static rtx
get_expansion (struct var_to_expand
*);
155 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
157 unroll_and_peel_loops (int flags
)
163 /* First perform complete loop peeling (it is almost surely a win,
164 and affects parameters for further decision a lot). */
165 peel_loops_completely (flags
);
167 /* Now decide rest of unrolling and peeling. */
168 decide_unrolling_and_peeling (flags
);
170 /* Scan the loops, inner ones first. */
171 FOR_EACH_LOOP (li
, loop
, LI_FROM_INNERMOST
)
175 /* And perform the appropriate transformations. */
176 switch (loop
->lpt_decision
.decision
)
178 case LPT_PEEL_COMPLETELY
:
181 case LPT_PEEL_SIMPLE
:
182 peel_loop_simple (loop
);
184 case LPT_UNROLL_CONSTANT
:
185 unroll_loop_constant_iterations (loop
);
187 case LPT_UNROLL_RUNTIME
:
188 unroll_loop_runtime_iterations (loop
);
190 case LPT_UNROLL_STUPID
:
191 unroll_loop_stupid (loop
);
201 #ifdef ENABLE_CHECKING
202 verify_dominators (CDI_DOMINATORS
);
203 verify_loop_structure ();
211 /* Check whether exit of the LOOP is at the end of loop body. */
214 loop_exit_at_end_p (struct loop
*loop
)
216 struct niter_desc
*desc
= get_simple_loop_desc (loop
);
219 if (desc
->in_edge
->dest
!= loop
->latch
)
222 /* Check that the latch is empty. */
223 FOR_BB_INSNS (loop
->latch
, insn
)
232 /* Depending on FLAGS, check whether to peel loops completely and do so. */
234 peel_loops_completely (int flags
)
239 /* Scan the loops, the inner ones first. */
240 FOR_EACH_LOOP (li
, loop
, LI_FROM_INNERMOST
)
242 loop
->lpt_decision
.decision
= LPT_NONE
;
246 "\n;; *** Considering loop %d for complete peeling ***\n",
249 loop
->ninsns
= num_loop_insns (loop
);
251 decide_peel_once_rolling (loop
, flags
);
252 if (loop
->lpt_decision
.decision
== LPT_NONE
)
253 decide_peel_completely (loop
, flags
);
255 if (loop
->lpt_decision
.decision
== LPT_PEEL_COMPLETELY
)
257 peel_loop_completely (loop
);
258 #ifdef ENABLE_CHECKING
259 verify_dominators (CDI_DOMINATORS
);
260 verify_loop_structure ();
266 /* Decide whether unroll or peel loops (depending on FLAGS) and how much. */
268 decide_unrolling_and_peeling (int flags
)
273 /* ICI: parameter holders */
275 /* Scan the loops, inner ones first. */
276 FOR_EACH_LOOP (li
, loop
, LI_FROM_INNERMOST
)
278 loop
->lpt_decision
.decision
= LPT_NONE
;
281 fprintf (dump_file
, "\n;; *** Considering loop %d ***\n", loop
->num
);
284 /* Do not peel cold areas. */
285 if (optimize_loop_for_size_p (loop
))
287 static bool ici_optimize_loop_for_size_p
;
288 ici_optimize_loop_for_size_p
= 1;
289 invoke_plugin_va_callbacks
290 (PLUGIN_UNROLL_PARAMETER_HANDLER
291 "_loop.id", EP_INT
, &(loop
->num
),
292 "loop.optimize_loop_for_size_p", EP_UNSIGNED_CHAR
,
293 &ici_optimize_loop_for_size_p
, NULL
);
295 if (ici_optimize_loop_for_size_p
)
298 fprintf (dump_file
, ";; Not considering loop, cold area\n");
303 /* Can the loop be manipulated? */
304 if (!can_duplicate_loop_p (loop
))
306 static bool ici_can_duplicate_loop_p
;
307 ici_can_duplicate_loop_p
= 0;
308 invoke_plugin_va_callbacks
309 (PLUGIN_UNROLL_PARAMETER_HANDLER
310 "_loop.id", EP_INT
, &(loop
->num
),
311 "_loop.can_duplicate_loop_p", EP_UNSIGNED_CHAR
,
312 &ici_can_duplicate_loop_p
, NULL
);
316 ";; Not considering loop, cannot duplicate\n");
320 /* Skip non-innermost loops. */
323 static bool ici_is_inner_most
;
324 ici_is_inner_most
= 0;
325 invoke_plugin_va_callbacks
326 (PLUGIN_UNROLL_PARAMETER_HANDLER
327 "_loop.id", EP_INT
, &(loop
->num
),
328 "_loop.is_inner_most", EP_UNSIGNED_CHAR
, &ici_is_inner_most
,
333 ";; Not considering loop, is not innermost\n");
337 loop
->ninsns
= num_loop_insns (loop
);
338 loop
->av_ninsns
= average_num_loop_insns (loop
);
340 /* Try transformations one by one in decreasing order of
343 decide_unroll_constant_iterations (loop
, flags
);
344 if (loop
->lpt_decision
.decision
== LPT_NONE
)
345 decide_unroll_runtime_iterations (loop
, flags
);
346 if (loop
->lpt_decision
.decision
== LPT_NONE
)
347 decide_unroll_stupid (loop
, flags
);
348 if (loop
->lpt_decision
.decision
== LPT_NONE
)
349 decide_peel_simple (loop
, flags
);
351 invoke_plugin_va_callbacks
352 (PLUGIN_UNROLL_PARAMETER_HANDLER
353 "_loop.id", EP_INT
, &(loop
->num
),
354 /* ICI: Number of loop insns. */
355 "_loop.ninsns", EP_UNSIGNED
, &(loop
->ninsns
),
356 /* ICI: Average number of executed insns per iteration. */
357 "_loop.av_ninsns", EP_UNSIGNED
, &(loop
->av_ninsns
),
358 /* ICI: Number of blocks contained within the loop. */
359 "_loop.num_nodes", EP_UNSIGNED
, &(loop
->num_nodes
),
360 /* ICI: Loop unrolling/peeling decision. */
361 "loop.lpt_decision.times", EP_UNSIGNED
, &(loop
->lpt_decision
.times
),
362 "loop.lpt_decision.decision", EP_INT
, /* Enum treated as int. */
363 &(loop
->lpt_decision
.decision
), NULL
);
367 /* Decide whether the LOOP is once rolling and suitable for complete
370 decide_peel_once_rolling (struct loop
*loop
, int flags ATTRIBUTE_UNUSED
)
372 struct niter_desc
*desc
;
375 fprintf (dump_file
, "\n;; Considering peeling once rolling loop\n");
377 /* Is the loop small enough? */
378 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS
) < loop
->ninsns
)
381 fprintf (dump_file
, ";; Not considering loop, is too big\n");
385 /* Check for simple loops. */
386 desc
= get_simple_loop_desc (loop
);
388 /* Check number of iterations. */
397 ";; Unable to prove that the loop rolls exactly once\n");
403 fprintf (dump_file
, ";; Decided to peel exactly once rolling loop\n");
404 loop
->lpt_decision
.decision
= LPT_PEEL_COMPLETELY
;
407 /* Decide whether the LOOP is suitable for complete peeling. */
409 decide_peel_completely (struct loop
*loop
, int flags ATTRIBUTE_UNUSED
)
412 struct niter_desc
*desc
;
415 fprintf (dump_file
, "\n;; Considering peeling completely\n");
417 /* Skip non-innermost loops. */
421 fprintf (dump_file
, ";; Not considering loop, is not innermost\n");
425 /* Do not peel cold areas. */
426 if (optimize_loop_for_size_p (loop
))
429 fprintf (dump_file
, ";; Not considering loop, cold area\n");
433 /* Can the loop be manipulated? */
434 if (!can_duplicate_loop_p (loop
))
438 ";; Not considering loop, cannot duplicate\n");
442 /* npeel = number of iterations to peel. */
443 npeel
= PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS
) / loop
->ninsns
;
444 if (npeel
> (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES
))
445 npeel
= PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES
);
447 /* Is the loop small enough? */
451 fprintf (dump_file
, ";; Not considering loop, is too big\n");
455 /* Check for simple loops. */
456 desc
= get_simple_loop_desc (loop
);
458 /* Check number of iterations. */
466 ";; Unable to prove that the loop iterates constant times\n");
470 if (desc
->niter
> npeel
- 1)
475 ";; Not peeling loop completely, rolls too much (");
476 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
, desc
->niter
);
477 fprintf (dump_file
, " iterations > %d [maximum peelings])\n", npeel
);
484 fprintf (dump_file
, ";; Decided to peel loop completely\n");
485 loop
->lpt_decision
.decision
= LPT_PEEL_COMPLETELY
;
488 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
489 completely. The transformation done:
491 for (i = 0; i < 4; i++)
503 peel_loop_completely (struct loop
*loop
)
506 unsigned HOST_WIDE_INT npeel
;
508 VEC (edge
, heap
) *remove_edges
;
510 struct niter_desc
*desc
= get_simple_loop_desc (loop
);
511 struct opt_info
*opt_info
= NULL
;
519 wont_exit
= sbitmap_alloc (npeel
+ 1);
520 sbitmap_ones (wont_exit
);
521 RESET_BIT (wont_exit
, 0);
522 if (desc
->noloop_assumptions
)
523 RESET_BIT (wont_exit
, 1);
527 if (flag_split_ivs_in_unroller
)
528 opt_info
= analyze_insns_in_loop (loop
);
530 opt_info_start_duplication (opt_info
);
531 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
533 wont_exit
, desc
->out_edge
,
535 DLTHE_FLAG_UPDATE_FREQ
536 | DLTHE_FLAG_COMPLETTE_PEEL
538 ? DLTHE_RECORD_COPY_NUMBER
: 0));
545 apply_opt_in_copies (opt_info
, npeel
, false, true);
546 free_opt_info (opt_info
);
549 /* Remove the exit edges. */
550 for (i
= 0; VEC_iterate (edge
, remove_edges
, i
, ein
); i
++)
552 VEC_free (edge
, heap
, remove_edges
);
556 free_simple_loop_desc (loop
);
558 /* Now remove the unreachable part of the last iteration and cancel
563 fprintf (dump_file
, ";; Peeled loop completely, %d times\n", (int) npeel
);
566 /* Decide whether to unroll LOOP iterating constant number of times
570 decide_unroll_constant_iterations (struct loop
*loop
, int flags
)
572 unsigned nunroll
, nunroll_by_av
, best_copies
, best_unroll
= 0, n_copies
, i
;
573 struct niter_desc
*desc
;
575 if (!(flags
& UAP_UNROLL
))
577 /* We were not asked to, just return back silently. */
583 "\n;; Considering unrolling loop with constant "
584 "number of iterations\n");
586 /* nunroll = total number of copies of the original loop body in
587 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
588 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS
) / loop
->ninsns
;
590 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS
) / loop
->av_ninsns
;
591 if (nunroll
> nunroll_by_av
)
592 nunroll
= nunroll_by_av
;
593 if (nunroll
> (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
))
594 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
);
596 /* Skip big loops. */
600 fprintf (dump_file
, ";; Not considering loop, is too big\n");
604 /* Check for simple loops. */
605 desc
= get_simple_loop_desc (loop
);
607 /* Check number of iterations. */
608 if (!desc
->simple_p
|| !desc
->const_iter
|| desc
->assumptions
)
612 ";; Unable to prove that the loop iterates constant times\n");
616 /* Check whether the loop rolls enough to consider. */
617 if (desc
->niter
< 2 * nunroll
)
620 fprintf (dump_file
, ";; Not unrolling loop, doesn't roll\n");
624 /* Success; now compute number of iterations to unroll. We alter
625 nunroll so that as few as possible copies of loop body are
626 necessary, while still not decreasing the number of unrollings
627 too much (at most by 1). */
628 best_copies
= 2 * nunroll
+ 10;
631 if (i
- 1 >= desc
->niter
)
634 for (; i
>= nunroll
- 1; i
--)
636 unsigned exit_mod
= desc
->niter
% (i
+ 1);
638 if (!loop_exit_at_end_p (loop
))
639 n_copies
= exit_mod
+ i
+ 1;
640 else if (exit_mod
!= (unsigned) i
641 || desc
->noloop_assumptions
!= NULL_RTX
)
642 n_copies
= exit_mod
+ i
+ 2;
646 if (n_copies
< best_copies
)
648 best_copies
= n_copies
;
654 fprintf (dump_file
, ";; max_unroll %d (%d copies, initial %d).\n",
655 best_unroll
+ 1, best_copies
, nunroll
);
657 loop
->lpt_decision
.decision
= LPT_UNROLL_CONSTANT
;
658 loop
->lpt_decision
.times
= best_unroll
;
662 ";; Decided to unroll the constant times rolling loop, %d times.\n",
663 loop
->lpt_decision
.times
);
666 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
667 times. The transformation does this:
669 for (i = 0; i < 102; i++)
686 unroll_loop_constant_iterations (struct loop
*loop
)
688 unsigned HOST_WIDE_INT niter
;
692 VEC (edge
, heap
) *remove_edges
;
694 unsigned max_unroll
= loop
->lpt_decision
.times
;
695 struct niter_desc
*desc
= get_simple_loop_desc (loop
);
696 bool exit_at_end
= loop_exit_at_end_p (loop
);
697 struct opt_info
*opt_info
= NULL
;
702 /* Should not get here (such loop should be peeled instead). */
703 gcc_assert (niter
> max_unroll
+ 1);
705 exit_mod
= niter
% (max_unroll
+ 1);
707 wont_exit
= sbitmap_alloc (max_unroll
+ 1);
708 sbitmap_ones (wont_exit
);
711 if (flag_split_ivs_in_unroller
712 || flag_variable_expansion_in_unroller
)
713 opt_info
= analyze_insns_in_loop (loop
);
717 /* The exit is not at the end of the loop; leave exit test
718 in the first copy, so that the loops that start with test
719 of exit condition have continuous body after unrolling. */
722 fprintf (dump_file
, ";; Condition on beginning of loop.\n");
724 /* Peel exit_mod iterations. */
725 RESET_BIT (wont_exit
, 0);
726 if (desc
->noloop_assumptions
)
727 RESET_BIT (wont_exit
, 1);
731 opt_info_start_duplication (opt_info
);
732 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
734 wont_exit
, desc
->out_edge
,
736 DLTHE_FLAG_UPDATE_FREQ
737 | (opt_info
&& exit_mod
> 1
738 ? DLTHE_RECORD_COPY_NUMBER
742 if (opt_info
&& exit_mod
> 1)
743 apply_opt_in_copies (opt_info
, exit_mod
, false, false);
745 desc
->noloop_assumptions
= NULL_RTX
;
746 desc
->niter
-= exit_mod
;
747 desc
->niter_max
-= exit_mod
;
750 SET_BIT (wont_exit
, 1);
754 /* Leave exit test in last copy, for the same reason as above if
755 the loop tests the condition at the end of loop body. */
758 fprintf (dump_file
, ";; Condition on end of loop.\n");
760 /* We know that niter >= max_unroll + 2; so we do not need to care of
761 case when we would exit before reaching the loop. So just peel
762 exit_mod + 1 iterations. */
763 if (exit_mod
!= max_unroll
764 || desc
->noloop_assumptions
)
766 RESET_BIT (wont_exit
, 0);
767 if (desc
->noloop_assumptions
)
768 RESET_BIT (wont_exit
, 1);
770 opt_info_start_duplication (opt_info
);
771 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
773 wont_exit
, desc
->out_edge
,
775 DLTHE_FLAG_UPDATE_FREQ
776 | (opt_info
&& exit_mod
> 0
777 ? DLTHE_RECORD_COPY_NUMBER
781 if (opt_info
&& exit_mod
> 0)
782 apply_opt_in_copies (opt_info
, exit_mod
+ 1, false, false);
784 desc
->niter
-= exit_mod
+ 1;
785 desc
->niter_max
-= exit_mod
+ 1;
786 desc
->noloop_assumptions
= NULL_RTX
;
788 SET_BIT (wont_exit
, 0);
789 SET_BIT (wont_exit
, 1);
792 RESET_BIT (wont_exit
, max_unroll
);
795 /* Now unroll the loop. */
797 opt_info_start_duplication (opt_info
);
798 ok
= duplicate_loop_to_header_edge (loop
, loop_latch_edge (loop
),
800 wont_exit
, desc
->out_edge
,
802 DLTHE_FLAG_UPDATE_FREQ
804 ? DLTHE_RECORD_COPY_NUMBER
810 apply_opt_in_copies (opt_info
, max_unroll
, true, true);
811 free_opt_info (opt_info
);
818 basic_block exit_block
= get_bb_copy (desc
->in_edge
->src
);
819 /* Find a new in and out edge; they are in the last copy we have made. */
821 if (EDGE_SUCC (exit_block
, 0)->dest
== desc
->out_edge
->dest
)
823 desc
->out_edge
= EDGE_SUCC (exit_block
, 0);
824 desc
->in_edge
= EDGE_SUCC (exit_block
, 1);
828 desc
->out_edge
= EDGE_SUCC (exit_block
, 1);
829 desc
->in_edge
= EDGE_SUCC (exit_block
, 0);
833 desc
->niter
/= max_unroll
+ 1;
834 desc
->niter_max
/= max_unroll
+ 1;
835 desc
->niter_expr
= GEN_INT (desc
->niter
);
837 /* Remove the edges. */
838 for (i
= 0; VEC_iterate (edge
, remove_edges
, i
, e
); i
++)
840 VEC_free (edge
, heap
, remove_edges
);
844 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
845 max_unroll
, num_loop_insns (loop
));
848 /* Decide whether to unroll LOOP iterating runtime computable number of times
851 decide_unroll_runtime_iterations (struct loop
*loop
, int flags
)
853 unsigned nunroll
, nunroll_by_av
, i
;
854 struct niter_desc
*desc
;
856 if (!(flags
& UAP_UNROLL
))
858 /* We were not asked to, just return back silently. */
864 "\n;; Considering unrolling loop with runtime "
865 "computable number of iterations\n");
867 /* nunroll = total number of copies of the original loop body in
868 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
869 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS
) / loop
->ninsns
;
870 nunroll_by_av
= PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS
) / loop
->av_ninsns
;
871 if (nunroll
> nunroll_by_av
)
872 nunroll
= nunroll_by_av
;
873 if (nunroll
> (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
))
874 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
);
876 /* Skip big loops. */
880 fprintf (dump_file
, ";; Not considering loop, is too big\n");
884 /* Check for simple loops. */
885 desc
= get_simple_loop_desc (loop
);
887 /* Check simpleness. */
888 if (!desc
->simple_p
|| desc
->assumptions
)
892 ";; Unable to prove that the number of iterations "
893 "can be counted in runtime\n");
897 if (desc
->const_iter
)
900 fprintf (dump_file
, ";; Loop iterates constant times\n");
904 /* If we have profile feedback, check whether the loop rolls. */
905 if (loop
->header
->count
&& expected_loop_iterations (loop
) < 2 * nunroll
)
908 fprintf (dump_file
, ";; Not unrolling loop, doesn't roll\n");
912 /* Success; now force nunroll to be power of 2, as we are unable to
913 cope with overflows in computation of number of iterations. */
914 for (i
= 1; 2 * i
<= nunroll
; i
*= 2)
917 loop
->lpt_decision
.decision
= LPT_UNROLL_RUNTIME
;
918 loop
->lpt_decision
.times
= i
- 1;
922 ";; Decided to unroll the runtime computable "
923 "times rolling loop, %d times.\n",
924 loop
->lpt_decision
.times
);
927 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
928 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
929 and NULL is returned instead. */
932 split_edge_and_insert (edge e
, rtx insns
)
939 emit_insn_after (insns
, BB_END (bb
));
941 /* ??? We used to assume that INSNS can contain control flow insns, and
942 that we had to try to find sub basic blocks in BB to maintain a valid
943 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
944 and call break_superblocks when going out of cfglayout mode. But it
945 turns out that this never happens; and that if it does ever happen,
946 the verify_flow_info call in loop_optimizer_finalize would fail.
948 There are two reasons why we expected we could have control flow insns
949 in INSNS. The first is when a comparison has to be done in parts, and
950 the second is when the number of iterations is computed for loops with
951 the number of iterations known at runtime. In both cases, test cases
952 to get control flow in INSNS appear to be impossible to construct:
954 * If do_compare_rtx_and_jump needs several branches to do comparison
955 in a mode that needs comparison by parts, we cannot analyze the
956 number of iterations of the loop, and we never get to unrolling it.
958 * The code in expand_divmod that was suspected to cause creation of
959 branching code seems to be only accessed for signed division. The
960 divisions used by # of iterations analysis are always unsigned.
961 Problems might arise on architectures that emits branching code
962 for some operations that may appear in the unroller (especially
963 for division), but we have no such architectures.
965 Considering all this, it was decided that we should for now assume
966 that INSNS can in theory contain control flow insns, but in practice
967 it never does. So we don't handle the theoretical case, and should
968 a real failure ever show up, we have a pretty good clue for how to
974 /* Unroll LOOP for that we are able to count number of iterations in runtime
975 LOOP->LPT_DECISION.TIMES + 1 times. The transformation does this (with some
976 extra care for case n < 0):
978 for (i = 0; i < n; i++)
1006 unroll_loop_runtime_iterations (struct loop
*loop
)
1008 rtx old_niter
, niter
, init_code
, branch_code
, tmp
;
1010 basic_block preheader
, *body
, swtch
, ezc_swtch
;
1011 VEC (basic_block
, heap
) *dom_bbs
;
1015 VEC (edge
, heap
) *remove_edges
;
1017 bool extra_zero_check
, last_may_exit
;
1018 unsigned max_unroll
= loop
->lpt_decision
.times
;
1019 struct niter_desc
*desc
= get_simple_loop_desc (loop
);
1020 bool exit_at_end
= loop_exit_at_end_p (loop
);
1021 struct opt_info
*opt_info
= NULL
;
1024 if (flag_split_ivs_in_unroller
1025 || flag_variable_expansion_in_unroller
)
1026 opt_info
= analyze_insns_in_loop (loop
);
1028 /* Remember blocks whose dominators will have to be updated. */
1031 body
= get_loop_body (loop
);
1032 for (i
= 0; i
< loop
->num_nodes
; i
++)
1034 VEC (basic_block
, heap
) *ldom
;
1037 ldom
= get_dominated_by (CDI_DOMINATORS
, body
[i
]);
1038 for (j
= 0; VEC_iterate (basic_block
, ldom
, j
, bb
); j
++)
1039 if (!flow_bb_inside_loop_p (loop
, bb
))
1040 VEC_safe_push (basic_block
, heap
, dom_bbs
, bb
);
1042 VEC_free (basic_block
, heap
, ldom
);
1048 /* Leave exit in first copy (for explanation why see comment in
1049 unroll_loop_constant_iterations). */
1051 n_peel
= max_unroll
- 1;
1052 extra_zero_check
= true;
1053 last_may_exit
= false;
1057 /* Leave exit in last copy (for explanation why see comment in
1058 unroll_loop_constant_iterations). */
1059 may_exit_copy
= max_unroll
;
1060 n_peel
= max_unroll
;
1061 extra_zero_check
= false;
1062 last_may_exit
= true;
1065 /* Get expression for number of iterations. */
1067 old_niter
= niter
= gen_reg_rtx (desc
->mode
);
1068 tmp
= force_operand (copy_rtx (desc
->niter_expr
), niter
);
1070 emit_move_insn (niter
, tmp
);
1072 /* Count modulo by ANDing it with max_unroll; we use the fact that
1073 the number of unrollings is a power of two, and thus this is correct
1074 even if there is overflow in the computation. */
1075 niter
= expand_simple_binop (desc
->mode
, AND
,
1077 GEN_INT (max_unroll
),
1078 NULL_RTX
, 0, OPTAB_LIB_WIDEN
);
1080 init_code
= get_insns ();
1082 unshare_all_rtl_in_chain (init_code
);
1084 /* Precondition the loop. */
1085 split_edge_and_insert (loop_preheader_edge (loop
), init_code
);
1087 remove_edges
= NULL
;
1089 wont_exit
= sbitmap_alloc (max_unroll
+ 2);
1091 /* Peel the first copy of loop body (almost always we must leave exit test
1092 here; the only exception is when we have extra zero check and the number
1093 of iterations is reliable. Also record the place of (possible) extra
1095 sbitmap_zero (wont_exit
);
1096 if (extra_zero_check
1097 && !desc
->noloop_assumptions
)
1098 SET_BIT (wont_exit
, 1);
1099 ezc_swtch
= loop_preheader_edge (loop
)->src
;
1100 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
1101 1, wont_exit
, desc
->out_edge
,
1103 DLTHE_FLAG_UPDATE_FREQ
);
1106 /* Record the place where switch will be built for preconditioning. */
1107 swtch
= split_edge (loop_preheader_edge (loop
));
1109 for (i
= 0; i
< n_peel
; i
++)
1111 /* Peel the copy. */
1112 sbitmap_zero (wont_exit
);
1113 if (i
!= n_peel
- 1 || !last_may_exit
)
1114 SET_BIT (wont_exit
, 1);
1115 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
1116 1, wont_exit
, desc
->out_edge
,
1118 DLTHE_FLAG_UPDATE_FREQ
);
1121 /* Create item for switch. */
1122 j
= n_peel
- i
- (extra_zero_check
? 0 : 1);
1123 p
= REG_BR_PROB_BASE
/ (i
+ 2);
1125 preheader
= split_edge (loop_preheader_edge (loop
));
1126 branch_code
= compare_and_jump_seq (copy_rtx (niter
), GEN_INT (j
), EQ
,
1127 block_label (preheader
), p
,
1130 /* We rely on the fact that the compare and jump cannot be optimized out,
1131 and hence the cfg we create is correct. */
1132 gcc_assert (branch_code
!= NULL_RTX
);
1134 swtch
= split_edge_and_insert (single_pred_edge (swtch
), branch_code
);
1135 set_immediate_dominator (CDI_DOMINATORS
, preheader
, swtch
);
1136 single_pred_edge (swtch
)->probability
= REG_BR_PROB_BASE
- p
;
1137 e
= make_edge (swtch
, preheader
,
1138 single_succ_edge (swtch
)->flags
& EDGE_IRREDUCIBLE_LOOP
);
1142 if (extra_zero_check
)
1144 /* Add branch for zero iterations. */
1145 p
= REG_BR_PROB_BASE
/ (max_unroll
+ 1);
1147 preheader
= split_edge (loop_preheader_edge (loop
));
1148 branch_code
= compare_and_jump_seq (copy_rtx (niter
), const0_rtx
, EQ
,
1149 block_label (preheader
), p
,
1151 gcc_assert (branch_code
!= NULL_RTX
);
1153 swtch
= split_edge_and_insert (single_succ_edge (swtch
), branch_code
);
1154 set_immediate_dominator (CDI_DOMINATORS
, preheader
, swtch
);
1155 single_succ_edge (swtch
)->probability
= REG_BR_PROB_BASE
- p
;
1156 e
= make_edge (swtch
, preheader
,
1157 single_succ_edge (swtch
)->flags
& EDGE_IRREDUCIBLE_LOOP
);
1161 /* Recount dominators for outer blocks. */
1162 iterate_fix_dominators (CDI_DOMINATORS
, dom_bbs
, false);
1164 /* And unroll loop. */
1166 sbitmap_ones (wont_exit
);
1167 RESET_BIT (wont_exit
, may_exit_copy
);
1168 opt_info_start_duplication (opt_info
);
1170 ok
= duplicate_loop_to_header_edge (loop
, loop_latch_edge (loop
),
1172 wont_exit
, desc
->out_edge
,
1174 DLTHE_FLAG_UPDATE_FREQ
1176 ? DLTHE_RECORD_COPY_NUMBER
1182 apply_opt_in_copies (opt_info
, max_unroll
, true, true);
1183 free_opt_info (opt_info
);
1190 basic_block exit_block
= get_bb_copy (desc
->in_edge
->src
);
1191 /* Find a new in and out edge; they are in the last copy we have
1194 if (EDGE_SUCC (exit_block
, 0)->dest
== desc
->out_edge
->dest
)
1196 desc
->out_edge
= EDGE_SUCC (exit_block
, 0);
1197 desc
->in_edge
= EDGE_SUCC (exit_block
, 1);
1201 desc
->out_edge
= EDGE_SUCC (exit_block
, 1);
1202 desc
->in_edge
= EDGE_SUCC (exit_block
, 0);
1206 /* Remove the edges. */
1207 for (i
= 0; VEC_iterate (edge
, remove_edges
, i
, e
); i
++)
1209 VEC_free (edge
, heap
, remove_edges
);
1211 /* We must be careful when updating the number of iterations due to
1212 preconditioning and the fact that the value must be valid at entry
1213 of the loop. After passing through the above code, we see that
1214 the correct new number of iterations is this: */
1215 gcc_assert (!desc
->const_iter
);
1217 simplify_gen_binary (UDIV
, desc
->mode
, old_niter
,
1218 GEN_INT (max_unroll
+ 1));
1219 desc
->niter_max
/= max_unroll
+ 1;
1223 simplify_gen_binary (MINUS
, desc
->mode
, desc
->niter_expr
, const1_rtx
);
1224 desc
->noloop_assumptions
= NULL_RTX
;
1230 ";; Unrolled loop %d times, counting # of iterations "
1231 "in runtime, %i insns\n",
1232 max_unroll
, num_loop_insns (loop
));
1234 VEC_free (basic_block
, heap
, dom_bbs
);
1237 /* Decide whether to simply peel LOOP and how much. */
1239 decide_peel_simple (struct loop
*loop
, int flags
)
1242 struct niter_desc
*desc
;
1244 if (!(flags
& UAP_PEEL
))
1246 /* We were not asked to, just return back silently. */
1251 fprintf (dump_file
, "\n;; Considering simply peeling loop\n");
1253 /* npeel = number of iterations to peel. */
1254 npeel
= PARAM_VALUE (PARAM_MAX_PEELED_INSNS
) / loop
->ninsns
;
1255 if (npeel
> (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES
))
1256 npeel
= PARAM_VALUE (PARAM_MAX_PEEL_TIMES
);
1258 /* Skip big loops. */
1262 fprintf (dump_file
, ";; Not considering loop, is too big\n");
1266 /* Check for simple loops. */
1267 desc
= get_simple_loop_desc (loop
);
1269 /* Check number of iterations. */
1270 if (desc
->simple_p
&& !desc
->assumptions
&& desc
->const_iter
)
1273 fprintf (dump_file
, ";; Loop iterates constant times\n");
1277 /* Do not simply peel loops with branches inside -- it increases number
1279 if (num_loop_branches (loop
) > 1)
1282 fprintf (dump_file
, ";; Not peeling, contains branches\n");
1286 if (loop
->header
->count
)
1288 unsigned niter
= expected_loop_iterations (loop
);
1289 if (niter
+ 1 > npeel
)
1293 fprintf (dump_file
, ";; Not peeling loop, rolls too much (");
1294 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
,
1295 (HOST_WIDEST_INT
) (niter
+ 1));
1296 fprintf (dump_file
, " iterations > %d [maximum peelings])\n",
1305 /* For now we have no good heuristics to decide whether loop peeling
1306 will be effective, so disable it. */
1309 ";; Not peeling loop, no evidence it will be profitable\n");
1314 loop
->lpt_decision
.decision
= LPT_PEEL_SIMPLE
;
1315 loop
->lpt_decision
.times
= npeel
;
1318 fprintf (dump_file
, ";; Decided to simply peel the loop, %d times.\n",
1319 loop
->lpt_decision
.times
);
1322 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1328 if (!cond) goto end;
1330 if (!cond) goto end;
1337 peel_loop_simple (struct loop
*loop
)
1340 unsigned npeel
= loop
->lpt_decision
.times
;
1341 struct niter_desc
*desc
= get_simple_loop_desc (loop
);
1342 struct opt_info
*opt_info
= NULL
;
1345 if (flag_split_ivs_in_unroller
&& npeel
> 1)
1346 opt_info
= analyze_insns_in_loop (loop
);
1348 wont_exit
= sbitmap_alloc (npeel
+ 1);
1349 sbitmap_zero (wont_exit
);
1351 opt_info_start_duplication (opt_info
);
1353 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
1354 npeel
, wont_exit
, NULL
,
1355 NULL
, DLTHE_FLAG_UPDATE_FREQ
1357 ? DLTHE_RECORD_COPY_NUMBER
1365 apply_opt_in_copies (opt_info
, npeel
, false, false);
1366 free_opt_info (opt_info
);
1371 if (desc
->const_iter
)
1373 desc
->niter
-= npeel
;
1374 desc
->niter_expr
= GEN_INT (desc
->niter
);
1375 desc
->noloop_assumptions
= NULL_RTX
;
1379 /* We cannot just update niter_expr, as its value might be clobbered
1380 inside loop. We could handle this by counting the number into
1381 temporary just like we do in runtime unrolling, but it does not
1383 free_simple_loop_desc (loop
);
1387 fprintf (dump_file
, ";; Peeling loop %d times\n", npeel
);
1390 /* Decide whether to unroll LOOP stupidly and how much. */
1392 decide_unroll_stupid (struct loop
*loop
, int flags
)
1394 unsigned nunroll
, nunroll_by_av
, i
;
1395 struct niter_desc
*desc
;
1397 if (!(flags
& UAP_UNROLL_ALL
))
1399 /* We were not asked to, just return back silently. */
1404 fprintf (dump_file
, "\n;; Considering unrolling loop stupidly\n");
1406 /* nunroll = total number of copies of the original loop body in
1407 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1408 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS
) / loop
->ninsns
;
1410 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS
) / loop
->av_ninsns
;
1411 if (nunroll
> nunroll_by_av
)
1412 nunroll
= nunroll_by_av
;
1413 if (nunroll
> (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
))
1414 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
);
1416 /* Skip big loops. */
1420 fprintf (dump_file
, ";; Not considering loop, is too big\n");
1424 /* Check for simple loops. */
1425 desc
= get_simple_loop_desc (loop
);
1427 /* Check simpleness. */
1428 if (desc
->simple_p
&& !desc
->assumptions
)
1431 fprintf (dump_file
, ";; The loop is simple\n");
1435 /* Do not unroll loops with branches inside -- it increases number
1437 if (num_loop_branches (loop
) > 1)
1440 fprintf (dump_file
, ";; Not unrolling, contains branches\n");
1444 /* If we have profile feedback, check whether the loop rolls. */
1445 if (loop
->header
->count
1446 && expected_loop_iterations (loop
) < 2 * nunroll
)
1449 fprintf (dump_file
, ";; Not unrolling loop, doesn't roll\n");
1453 /* Success. Now force nunroll to be power of 2, as it seems that this
1454 improves results (partially because of better alignments, partially
1455 because of some dark magic). */
1456 for (i
= 1; 2 * i
<= nunroll
; i
*= 2)
1459 loop
->lpt_decision
.decision
= LPT_UNROLL_STUPID
;
1460 loop
->lpt_decision
.times
= i
- 1;
1464 ";; Decided to unroll the loop stupidly, %d times.\n",
1465 loop
->lpt_decision
.times
);
1468 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1486 unroll_loop_stupid (struct loop
*loop
)
1489 unsigned nunroll
= loop
->lpt_decision
.times
;
1490 struct niter_desc
*desc
= get_simple_loop_desc (loop
);
1491 struct opt_info
*opt_info
= NULL
;
1494 if (flag_split_ivs_in_unroller
1495 || flag_variable_expansion_in_unroller
)
1496 opt_info
= analyze_insns_in_loop (loop
);
1499 wont_exit
= sbitmap_alloc (nunroll
+ 1);
1500 sbitmap_zero (wont_exit
);
1501 opt_info_start_duplication (opt_info
);
1503 ok
= duplicate_loop_to_header_edge (loop
, loop_latch_edge (loop
),
1506 DLTHE_FLAG_UPDATE_FREQ
1508 ? DLTHE_RECORD_COPY_NUMBER
1514 apply_opt_in_copies (opt_info
, nunroll
, true, true);
1515 free_opt_info (opt_info
);
1522 /* We indeed may get here provided that there are nontrivial assumptions
1523 for a loop to be really simple. We could update the counts, but the
1524 problem is that we are unable to decide which exit will be taken
1525 (not really true in case the number of iterations is constant,
1526 but noone will do anything with this information, so we do not
1528 desc
->simple_p
= false;
1532 fprintf (dump_file
, ";; Unrolled loop %d times, %i insns\n",
1533 nunroll
, num_loop_insns (loop
));
1536 /* A hash function for information about insns to split. */
1539 si_info_hash (const void *ivts
)
1541 return (hashval_t
) INSN_UID (((const struct iv_to_split
*) ivts
)->insn
);
1544 /* An equality functions for information about insns to split. */
1547 si_info_eq (const void *ivts1
, const void *ivts2
)
1549 const struct iv_to_split
*const i1
= (const struct iv_to_split
*) ivts1
;
1550 const struct iv_to_split
*const i2
= (const struct iv_to_split
*) ivts2
;
1552 return i1
->insn
== i2
->insn
;
1555 /* Return a hash for VES, which is really a "var_to_expand *". */
1558 ve_info_hash (const void *ves
)
1560 return (hashval_t
) INSN_UID (((const struct var_to_expand
*) ves
)->insn
);
1563 /* Return true if IVTS1 and IVTS2 (which are really both of type
1564 "var_to_expand *") refer to the same instruction. */
1567 ve_info_eq (const void *ivts1
, const void *ivts2
)
1569 const struct var_to_expand
*const i1
= (const struct var_to_expand
*) ivts1
;
1570 const struct var_to_expand
*const i2
= (const struct var_to_expand
*) ivts2
;
1572 return i1
->insn
== i2
->insn
;
1575 /* Returns true if REG is referenced in one insn in LOOP. */
1578 referenced_in_one_insn_in_loop_p (struct loop
*loop
, rtx reg
)
1580 basic_block
*body
, bb
;
1585 body
= get_loop_body (loop
);
1586 for (i
= 0; i
< loop
->num_nodes
; i
++)
1590 FOR_BB_INSNS (bb
, insn
)
1592 if (rtx_referenced_p (reg
, insn
))
1596 return (count_ref
== 1);
1599 /* Determine whether INSN contains an accumulator
1600 which can be expanded into separate copies,
1601 one for each copy of the LOOP body.
1603 for (i = 0 ; i < n; i++)
1617 Return NULL if INSN contains no opportunity for expansion of accumulator.
1618 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1619 information and return a pointer to it.
1622 static struct var_to_expand
*
1623 analyze_insn_to_expand_var (struct loop
*loop
, rtx insn
)
1625 rtx set
, dest
, src
, op1
, op2
, something
;
1626 struct var_to_expand
*ves
;
1627 enum machine_mode mode1
, mode2
;
1630 set
= single_set (insn
);
1634 dest
= SET_DEST (set
);
1635 src
= SET_SRC (set
);
1637 if (GET_CODE (src
) != PLUS
1638 && GET_CODE (src
) != MINUS
1639 && GET_CODE (src
) != MULT
)
1642 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1643 in MD. But if there is no optab to generate the insn, we can not
1644 perform the variable expansion. This can happen if an MD provides
1645 an insn but not a named pattern to generate it, for example to avoid
1646 producing code that needs additional mode switches like for x87/mmx.
1648 So we check have_insn_for which looks for an optab for the operation
1649 in SRC. If it doesn't exist, we can't perform the expansion even
1650 though INSN is valid. */
1651 if (!have_insn_for (GET_CODE (src
), GET_MODE (src
)))
1654 op1
= XEXP (src
, 0);
1655 op2
= XEXP (src
, 1);
1658 && !(GET_CODE (dest
) == SUBREG
1659 && REG_P (SUBREG_REG (dest
))))
1662 if (rtx_equal_p (dest
, op1
))
1664 else if (rtx_equal_p (dest
, op2
))
1669 /* The method of expansion that we are using; which includes
1670 the initialization of the expansions with zero and the summation of
1671 the expansions at the end of the computation will yield wrong results
1672 for (x = something - x) thus avoid using it in that case. */
1674 && GET_CODE (src
) == MINUS
)
1677 something
= (accum_pos
== 0)? op2
: op1
;
1679 if (!referenced_in_one_insn_in_loop_p (loop
, dest
))
1682 if (rtx_referenced_p (dest
, something
))
1685 mode1
= GET_MODE (dest
);
1686 mode2
= GET_MODE (something
);
1687 if ((FLOAT_MODE_P (mode1
)
1688 || FLOAT_MODE_P (mode2
))
1689 && !flag_associative_math
)
1695 "\n;; Expanding Accumulator ");
1696 print_rtl (dump_file
, dest
);
1697 fprintf (dump_file
, "\n");
1700 /* Record the accumulator to expand. */
1701 ves
= XNEW (struct var_to_expand
);
1703 ves
->reg
= copy_rtx (dest
);
1704 ves
->var_expansions
= VEC_alloc (rtx
, heap
, 1);
1706 ves
->op
= GET_CODE (src
);
1707 ves
->expansion_count
= 0;
1708 ves
->reuse_expansion
= 0;
1709 ves
->accum_pos
= accum_pos
;
1713 /* Determine whether there is an induction variable in INSN that
1714 we would like to split during unrolling.
1734 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1735 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1738 static struct iv_to_split
*
1739 analyze_iv_to_split_insn (rtx insn
)
1743 struct iv_to_split
*ivts
;
1746 /* For now we just split the basic induction variables. Later this may be
1747 extended for example by selecting also addresses of memory references. */
1748 set
= single_set (insn
);
1752 dest
= SET_DEST (set
);
1756 if (!biv_p (insn
, dest
))
1759 ok
= iv_analyze_result (insn
, dest
, &iv
);
1761 /* This used to be an assert under the assumption that if biv_p returns
1762 true that iv_analyze_result must also return true. However, that
1763 assumption is not strictly correct as evidenced by pr25569.
1765 Returning NULL when iv_analyze_result returns false is safe and
1766 avoids the problems in pr25569 until the iv_analyze_* routines
1767 can be fixed, which is apparently hard and time consuming
1768 according to their author. */
1772 if (iv
.step
== const0_rtx
1773 || iv
.mode
!= iv
.extend_mode
)
1776 /* Record the insn to split. */
1777 ivts
= XNEW (struct iv_to_split
);
1779 ivts
->base_var
= NULL_RTX
;
1780 ivts
->step
= iv
.step
;
1788 /* Determines which of insns in LOOP can be optimized.
1789 Return a OPT_INFO struct with the relevant hash tables filled
1790 with all insns to be optimized. The FIRST_NEW_BLOCK field
1791 is undefined for the return value. */
1793 static struct opt_info
*
1794 analyze_insns_in_loop (struct loop
*loop
)
1796 basic_block
*body
, bb
;
1798 struct opt_info
*opt_info
= XCNEW (struct opt_info
);
1800 struct iv_to_split
*ivts
= NULL
;
1801 struct var_to_expand
*ves
= NULL
;
1804 VEC (edge
, heap
) *edges
= get_loop_exit_edges (loop
);
1806 bool can_apply
= false;
1808 iv_analysis_loop_init (loop
);
1810 body
= get_loop_body (loop
);
1812 if (flag_split_ivs_in_unroller
)
1814 opt_info
->insns_to_split
= htab_create (5 * loop
->num_nodes
,
1815 si_info_hash
, si_info_eq
, free
);
1816 opt_info
->iv_to_split_head
= NULL
;
1817 opt_info
->iv_to_split_tail
= &opt_info
->iv_to_split_head
;
1820 /* Record the loop exit bb and loop preheader before the unrolling. */
1821 opt_info
->loop_preheader
= loop_preheader_edge (loop
)->src
;
1823 if (VEC_length (edge
, edges
) == 1)
1825 exit
= VEC_index (edge
, edges
, 0);
1826 if (!(exit
->flags
& EDGE_COMPLEX
))
1828 opt_info
->loop_exit
= split_edge (exit
);
1833 if (flag_variable_expansion_in_unroller
1836 opt_info
->insns_with_var_to_expand
= htab_create (5 * loop
->num_nodes
,
1839 opt_info
->var_to_expand_head
= NULL
;
1840 opt_info
->var_to_expand_tail
= &opt_info
->var_to_expand_head
;
1843 for (i
= 0; i
< loop
->num_nodes
; i
++)
1846 if (!dominated_by_p (CDI_DOMINATORS
, loop
->latch
, bb
))
1849 FOR_BB_INSNS (bb
, insn
)
1854 if (opt_info
->insns_to_split
)
1855 ivts
= analyze_iv_to_split_insn (insn
);
1859 slot1
= htab_find_slot (opt_info
->insns_to_split
, ivts
, INSERT
);
1860 gcc_assert (*slot1
== NULL
);
1862 *opt_info
->iv_to_split_tail
= ivts
;
1863 opt_info
->iv_to_split_tail
= &ivts
->next
;
1867 if (opt_info
->insns_with_var_to_expand
)
1868 ves
= analyze_insn_to_expand_var (loop
, insn
);
1872 slot2
= htab_find_slot (opt_info
->insns_with_var_to_expand
, ves
, INSERT
);
1873 gcc_assert (*slot2
== NULL
);
1875 *opt_info
->var_to_expand_tail
= ves
;
1876 opt_info
->var_to_expand_tail
= &ves
->next
;
1881 VEC_free (edge
, heap
, edges
);
1886 /* Called just before loop duplication. Records start of duplicated area
1890 opt_info_start_duplication (struct opt_info
*opt_info
)
1893 opt_info
->first_new_block
= last_basic_block
;
/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in that the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
1921 /* Locate in EXPR the expression corresponding to the location recorded
1922 in IVTS, and return a pointer to the RTX for this location. */
1925 get_ivts_expr (rtx expr
, struct iv_to_split
*ivts
)
1930 for (i
= 0; i
< ivts
->n_loc
; i
++)
1931 ret
= &XEXP (*ret
, ivts
->loc
[i
]);
1936 /* Allocate basic variable for the induction variable chain. */
1939 allocate_basic_variable (struct iv_to_split
*ivts
)
1941 rtx expr
= *get_ivts_expr (single_set (ivts
->insn
), ivts
);
1943 ivts
->base_var
= gen_reg_rtx (GET_MODE (expr
));
1946 /* Insert initialization of basic variable of IVTS before INSN, taking
1947 the initial value from INSN. */
1950 insert_base_initialization (struct iv_to_split
*ivts
, rtx insn
)
1952 rtx expr
= copy_rtx (*get_ivts_expr (single_set (insn
), ivts
));
1956 expr
= force_operand (expr
, ivts
->base_var
);
1957 if (expr
!= ivts
->base_var
)
1958 emit_move_insn (ivts
->base_var
, expr
);
1962 emit_insn_before (seq
, insn
);
1965 /* Replace the use of induction variable described in IVTS in INSN
1966 by base variable + DELTA * step. */
1969 split_iv (struct iv_to_split
*ivts
, rtx insn
, unsigned delta
)
1971 rtx expr
, *loc
, seq
, incr
, var
;
1972 enum machine_mode mode
= GET_MODE (ivts
->base_var
);
1975 /* Construct base + DELTA * step. */
1977 expr
= ivts
->base_var
;
1980 incr
= simplify_gen_binary (MULT
, mode
,
1981 ivts
->step
, gen_int_mode (delta
, mode
));
1982 expr
= simplify_gen_binary (PLUS
, GET_MODE (ivts
->base_var
),
1983 ivts
->base_var
, incr
);
1986 /* Figure out where to do the replacement. */
1987 loc
= get_ivts_expr (single_set (insn
), ivts
);
1989 /* If we can make the replacement right away, we're done. */
1990 if (validate_change (insn
, loc
, expr
, 0))
1993 /* Otherwise, force EXPR into a register and try again. */
1995 var
= gen_reg_rtx (mode
);
1996 expr
= force_operand (expr
, var
);
1998 emit_move_insn (var
, expr
);
2001 emit_insn_before (seq
, insn
);
2003 if (validate_change (insn
, loc
, var
, 0))
2006 /* The last chance. Try recreating the assignment in insn
2007 completely from scratch. */
2008 set
= single_set (insn
);
2013 src
= copy_rtx (SET_SRC (set
));
2014 dest
= copy_rtx (SET_DEST (set
));
2015 src
= force_operand (src
, dest
);
2017 emit_move_insn (dest
, src
);
2021 emit_insn_before (seq
, insn
);
2026 /* Return one expansion of the accumulator recorded in struct VE. */
2029 get_expansion (struct var_to_expand
*ve
)
2033 if (ve
->reuse_expansion
== 0)
2036 reg
= VEC_index (rtx
, ve
->var_expansions
, ve
->reuse_expansion
- 1);
2038 if (VEC_length (rtx
, ve
->var_expansions
) == (unsigned) ve
->reuse_expansion
)
2039 ve
->reuse_expansion
= 0;
2041 ve
->reuse_expansion
++;
2047 /* Given INSN replace the uses of the accumulator recorded in VE
2048 with a new register. */
2051 expand_var_during_unrolling (struct var_to_expand
*ve
, rtx insn
)
2054 bool really_new_expansion
= false;
2056 set
= single_set (insn
);
2059 /* Generate a new register only if the expansion limit has not been
2060 reached. Else reuse an already existing expansion. */
2061 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS
) > ve
->expansion_count
)
2063 really_new_expansion
= true;
2064 new_reg
= gen_reg_rtx (GET_MODE (ve
->reg
));
2067 new_reg
= get_expansion (ve
);
2069 validate_change (insn
, &SET_DEST (set
), new_reg
, 1);
2070 validate_change (insn
, &XEXP (SET_SRC (set
), ve
->accum_pos
), new_reg
, 1);
2072 if (apply_change_group ())
2073 if (really_new_expansion
)
2075 VEC_safe_push (rtx
, heap
, ve
->var_expansions
, new_reg
);
2076 ve
->expansion_count
++;
2080 /* Initialize the variable expansions in loop preheader. PLACE is the
2081 loop-preheader basic block where the initialization of the
2082 expansions should take place. The expansions are initialized with
2083 (-0) when the operation is plus or minus to honor sign zero. This
2084 way we can prevent cases where the sign of the final result is
2085 effected by the sign of the expansion. Here is an example to
2088 for (i = 0 ; i < n; i++)
2102 When SUM is initialized with -zero and SOMETHING is also -zero; the
2103 final result of sum should be -zero thus the expansions sum1 and sum2
2104 should be initialized with -zero as well (otherwise we will get +zero
2105 as the final result). */
2108 insert_var_expansion_initialization (struct var_to_expand
*ve
,
2111 rtx seq
, var
, zero_init
, insn
;
2113 enum machine_mode mode
= GET_MODE (ve
->reg
);
2114 bool honor_signed_zero_p
= HONOR_SIGNED_ZEROS (mode
);
2116 if (VEC_length (rtx
, ve
->var_expansions
) == 0)
2120 if (ve
->op
== PLUS
|| ve
->op
== MINUS
)
2121 for (i
= 0; VEC_iterate (rtx
, ve
->var_expansions
, i
, var
); i
++)
2123 if (honor_signed_zero_p
)
2124 zero_init
= simplify_gen_unary (NEG
, mode
, CONST0_RTX (mode
), mode
);
2126 zero_init
= CONST0_RTX (mode
);
2128 emit_move_insn (var
, zero_init
);
2130 else if (ve
->op
== MULT
)
2131 for (i
= 0; VEC_iterate (rtx
, ve
->var_expansions
, i
, var
); i
++)
2133 zero_init
= CONST1_RTX (GET_MODE (var
));
2134 emit_move_insn (var
, zero_init
);
2140 insn
= BB_HEAD (place
);
2141 while (!NOTE_INSN_BASIC_BLOCK_P (insn
))
2142 insn
= NEXT_INSN (insn
);
2144 emit_insn_after (seq
, insn
);
2147 /* Combine the variable expansions at the loop exit. PLACE is the
2148 loop exit basic block where the summation of the expansions should
2152 combine_var_copies_in_loop_exit (struct var_to_expand
*ve
, basic_block place
)
2155 rtx expr
, seq
, var
, insn
;
2158 if (VEC_length (rtx
, ve
->var_expansions
) == 0)
2162 if (ve
->op
== PLUS
|| ve
->op
== MINUS
)
2163 for (i
= 0; VEC_iterate (rtx
, ve
->var_expansions
, i
, var
); i
++)
2165 sum
= simplify_gen_binary (PLUS
, GET_MODE (ve
->reg
),
2168 else if (ve
->op
== MULT
)
2169 for (i
= 0; VEC_iterate (rtx
, ve
->var_expansions
, i
, var
); i
++)
2171 sum
= simplify_gen_binary (MULT
, GET_MODE (ve
->reg
),
2175 expr
= force_operand (sum
, ve
->reg
);
2176 if (expr
!= ve
->reg
)
2177 emit_move_insn (ve
->reg
, expr
);
2181 insn
= BB_HEAD (place
);
2182 while (!NOTE_INSN_BASIC_BLOCK_P (insn
))
2183 insn
= NEXT_INSN (insn
);
2185 emit_insn_after (seq
, insn
);
2188 /* Apply loop optimizations in loop copies using the
2189 data which gathered during the unrolling. Structure
2190 OPT_INFO record that data.
2192 UNROLLING is true if we unrolled (not peeled) the loop.
2193 REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2194 the loop (as it should happen in complete unrolling, but not in ordinary
2195 peeling of the loop). */
2198 apply_opt_in_copies (struct opt_info
*opt_info
,
2199 unsigned n_copies
, bool unrolling
,
2200 bool rewrite_original_loop
)
2203 basic_block bb
, orig_bb
;
2204 rtx insn
, orig_insn
, next
;
2205 struct iv_to_split ivts_templ
, *ivts
;
2206 struct var_to_expand ve_templ
, *ves
;
2208 /* Sanity check -- we need to put initialization in the original loop
2210 gcc_assert (!unrolling
|| rewrite_original_loop
);
2212 /* Allocate the basic variables (i0). */
2213 if (opt_info
->insns_to_split
)
2214 for (ivts
= opt_info
->iv_to_split_head
; ivts
; ivts
= ivts
->next
)
2215 allocate_basic_variable (ivts
);
2217 for (i
= opt_info
->first_new_block
; i
< (unsigned) last_basic_block
; i
++)
2219 bb
= BASIC_BLOCK (i
);
2220 orig_bb
= get_bb_original (bb
);
2222 /* bb->aux holds position in copy sequence initialized by
2223 duplicate_loop_to_header_edge. */
2224 delta
= determine_split_iv_delta ((size_t)bb
->aux
, n_copies
,
2227 orig_insn
= BB_HEAD (orig_bb
);
2228 for (insn
= BB_HEAD (bb
); insn
!= NEXT_INSN (BB_END (bb
)); insn
= next
)
2230 next
= NEXT_INSN (insn
);
2234 while (!INSN_P (orig_insn
))
2235 orig_insn
= NEXT_INSN (orig_insn
);
2237 ivts_templ
.insn
= orig_insn
;
2238 ve_templ
.insn
= orig_insn
;
2240 /* Apply splitting iv optimization. */
2241 if (opt_info
->insns_to_split
)
2243 ivts
= (struct iv_to_split
*)
2244 htab_find (opt_info
->insns_to_split
, &ivts_templ
);
2248 gcc_assert (GET_CODE (PATTERN (insn
))
2249 == GET_CODE (PATTERN (orig_insn
)));
2252 insert_base_initialization (ivts
, insn
);
2253 split_iv (ivts
, insn
, delta
);
2256 /* Apply variable expansion optimization. */
2257 if (unrolling
&& opt_info
->insns_with_var_to_expand
)
2259 ves
= (struct var_to_expand
*)
2260 htab_find (opt_info
->insns_with_var_to_expand
, &ve_templ
);
2263 gcc_assert (GET_CODE (PATTERN (insn
))
2264 == GET_CODE (PATTERN (orig_insn
)));
2265 expand_var_during_unrolling (ves
, insn
);
2268 orig_insn
= NEXT_INSN (orig_insn
);
2272 if (!rewrite_original_loop
)
2275 /* Initialize the variable expansions in the loop preheader
2276 and take care of combining them at the loop exit. */
2277 if (opt_info
->insns_with_var_to_expand
)
2279 for (ves
= opt_info
->var_to_expand_head
; ves
; ves
= ves
->next
)
2280 insert_var_expansion_initialization (ves
, opt_info
->loop_preheader
);
2281 for (ves
= opt_info
->var_to_expand_head
; ves
; ves
= ves
->next
)
2282 combine_var_copies_in_loop_exit (ves
, opt_info
->loop_exit
);
2285 /* Rewrite also the original loop body. Find them as originals of the blocks
2286 in the last copied iteration, i.e. those that have
2287 get_bb_copy (get_bb_original (bb)) == bb. */
2288 for (i
= opt_info
->first_new_block
; i
< (unsigned) last_basic_block
; i
++)
2290 bb
= BASIC_BLOCK (i
);
2291 orig_bb
= get_bb_original (bb
);
2292 if (get_bb_copy (orig_bb
) != bb
)
2295 delta
= determine_split_iv_delta (0, n_copies
, unrolling
);
2296 for (orig_insn
= BB_HEAD (orig_bb
);
2297 orig_insn
!= NEXT_INSN (BB_END (bb
));
2300 next
= NEXT_INSN (orig_insn
);
2302 if (!INSN_P (orig_insn
))
2305 ivts_templ
.insn
= orig_insn
;
2306 if (opt_info
->insns_to_split
)
2308 ivts
= (struct iv_to_split
*)
2309 htab_find (opt_info
->insns_to_split
, &ivts_templ
);
2313 insert_base_initialization (ivts
, orig_insn
);
2314 split_iv (ivts
, orig_insn
, delta
);
2323 /* Release OPT_INFO. */
2326 free_opt_info (struct opt_info
*opt_info
)
2328 if (opt_info
->insns_to_split
)
2329 htab_delete (opt_info
->insns_to_split
);
2330 if (opt_info
->insns_with_var_to_expand
)
2332 struct var_to_expand
*ves
;
2334 for (ves
= opt_info
->var_to_expand_head
; ves
; ves
= ves
->next
)
2335 VEC_free (rtx
, heap
, ves
->var_expansions
);
2336 htab_delete (opt_info
->insns_with_var_to_expand
);