   Copyright (C) 2002-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
22 #include "coretypes.h"
/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain is caused by greater sequentiality
   of code, better code to optimize for further passes and in some cases
   by fewer testings of exit conditions.  The main problem is code growth,
   that impacts performance negatively due to effect of caches.

   What we do:

   -- unrolling of loops that roll constant times; this is almost always
      win, as we get rid of exit condition tests.
   -- unrolling of loops that roll number of times that we can compute
      in runtime; we also get rid of exit condition tests here, but there
      is the extra expense for calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code

   For more detailed descriptions of each of those, see comments at
   appropriate function below.

   There is a lot of parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A great problem is that we don't have a good way how to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance in order of several %.  */
69 /* Information about induction variables to split. */
73 rtx_insn
*insn
; /* The insn in that the induction variable occurs. */
74 rtx orig_var
; /* The variable (register) for the IV before split. */
75 rtx base_var
; /* The variable on that the values in the further
76 iterations are based. */
77 rtx step
; /* Step of the induction variable. */
78 struct iv_to_split
*next
; /* Next entry in walking order. */
81 /* Information about accumulators to expand. */
85 rtx_insn
*insn
; /* The insn in that the variable expansion occurs. */
86 rtx reg
; /* The accumulator which is expanded. */
87 vec
<rtx
> var_expansions
; /* The copies of the accumulator which is expanded. */
88 struct var_to_expand
*next
; /* Next entry in walking order. */
89 enum rtx_code op
; /* The type of the accumulation - addition, subtraction
91 int expansion_count
; /* Count the number of expansions generated so far. */
92 int reuse_expansion
; /* The expansion we intend to reuse to expand
93 the accumulator. If REUSE_EXPANSION is 0 reuse
94 the original accumulator. Else use
95 var_expansions[REUSE_EXPANSION - 1]. */
98 /* Hashtable helper for iv_to_split. */
100 struct iv_split_hasher
: free_ptr_hash
<iv_to_split
>
102 static inline hashval_t
hash (const iv_to_split
*);
103 static inline bool equal (const iv_to_split
*, const iv_to_split
*);
107 /* A hash function for information about insns to split. */
110 iv_split_hasher::hash (const iv_to_split
*ivts
)
112 return (hashval_t
) INSN_UID (ivts
->insn
);
115 /* An equality functions for information about insns to split. */
118 iv_split_hasher::equal (const iv_to_split
*i1
, const iv_to_split
*i2
)
120 return i1
->insn
== i2
->insn
;
123 /* Hashtable helper for iv_to_split. */
125 struct var_expand_hasher
: free_ptr_hash
<var_to_expand
>
127 static inline hashval_t
hash (const var_to_expand
*);
128 static inline bool equal (const var_to_expand
*, const var_to_expand
*);
131 /* Return a hash for VES. */
134 var_expand_hasher::hash (const var_to_expand
*ves
)
136 return (hashval_t
) INSN_UID (ves
->insn
);
139 /* Return true if I1 and I2 refer to the same instruction. */
142 var_expand_hasher::equal (const var_to_expand
*i1
, const var_to_expand
*i2
)
144 return i1
->insn
== i2
->insn
;
147 /* Information about optimization applied in
148 the unrolled loop. */
152 hash_table
<iv_split_hasher
> *insns_to_split
; /* A hashtable of insns to
154 struct iv_to_split
*iv_to_split_head
; /* The first iv to split. */
155 struct iv_to_split
**iv_to_split_tail
; /* Pointer to the tail of the list. */
156 hash_table
<var_expand_hasher
> *insns_with_var_to_expand
; /* A hashtable of
157 insns with accumulators to expand. */
158 struct var_to_expand
*var_to_expand_head
; /* The first var to expand. */
159 struct var_to_expand
**var_to_expand_tail
; /* Pointer to the tail of the list. */
160 unsigned first_new_block
; /* The first basic block that was
162 basic_block loop_exit
; /* The loop exit basic block. */
163 basic_block loop_preheader
; /* The loop preheader basic block. */
166 static void decide_unroll_stupid (class loop
*, int);
167 static void decide_unroll_constant_iterations (class loop
*, int);
168 static void decide_unroll_runtime_iterations (class loop
*, int);
169 static void unroll_loop_stupid (class loop
*);
170 static void decide_unrolling (int);
171 static void unroll_loop_constant_iterations (class loop
*);
172 static void unroll_loop_runtime_iterations (class loop
*);
173 static struct opt_info
*analyze_insns_in_loop (class loop
*);
174 static void opt_info_start_duplication (struct opt_info
*);
175 static void apply_opt_in_copies (struct opt_info
*, unsigned, bool, bool);
176 static void free_opt_info (struct opt_info
*);
177 static struct var_to_expand
*analyze_insn_to_expand_var (class loop
*, rtx_insn
*);
178 static bool referenced_in_one_insn_in_loop_p (class loop
*, rtx
, int *);
179 static struct iv_to_split
*analyze_iv_to_split_insn (rtx_insn
*);
180 static void expand_var_during_unrolling (struct var_to_expand
*, rtx_insn
*);
181 static void insert_var_expansion_initialization (struct var_to_expand
*,
183 static void combine_var_copies_in_loop_exit (struct var_to_expand
*,
185 static rtx
get_expansion (struct var_to_expand
*);
187 /* Emit a message summarizing the unroll that will be
188 performed for LOOP, along with the loop's location LOCUS, if
189 appropriate given the dump or -fopt-info settings. */
192 report_unroll (class loop
*loop
, dump_location_t locus
)
194 dump_flags_t report_flags
= MSG_OPTIMIZED_LOCATIONS
| TDF_DETAILS
;
196 if (loop
->lpt_decision
.decision
== LPT_NONE
)
199 if (!dump_enabled_p ())
202 dump_metadata_t
metadata (report_flags
, locus
.get_impl_location ());
203 dump_printf_loc (metadata
, locus
.get_user_location (),
204 "loop unrolled %d times",
205 loop
->lpt_decision
.times
);
206 if (profile_info
&& loop
->header
->count
.initialized_p ())
207 dump_printf (metadata
,
208 " (header execution count %d)",
209 (int)loop
->header
->count
.to_gcov_type ());
211 dump_printf (metadata
, "\n");
214 /* Decide whether unroll loops and how much. */
216 decide_unrolling (int flags
)
220 /* Scan the loops, inner ones first. */
221 FOR_EACH_LOOP (loop
, LI_FROM_INNERMOST
)
223 loop
->lpt_decision
.decision
= LPT_NONE
;
224 dump_user_location_t locus
= get_loop_location (loop
);
226 if (dump_enabled_p ())
227 dump_printf_loc (MSG_NOTE
, locus
,
228 "considering unrolling loop %d at BB %d\n",
229 loop
->num
, loop
->header
->index
);
231 if (loop
->unroll
== 1)
235 ";; Not unrolling loop, user didn't want it unrolled\n");
239 /* Do not peel cold areas. */
240 if (optimize_loop_for_size_p (loop
))
243 fprintf (dump_file
, ";; Not considering loop, cold area\n");
247 /* Can the loop be manipulated? */
248 if (!can_duplicate_loop_p (loop
))
252 ";; Not considering loop, cannot duplicate\n");
256 /* Skip non-innermost loops. */
260 fprintf (dump_file
, ";; Not considering loop, is not innermost\n");
264 loop
->ninsns
= num_loop_insns (loop
);
265 loop
->av_ninsns
= average_num_loop_insns (loop
);
267 /* Try transformations one by one in decreasing order of priority. */
268 decide_unroll_constant_iterations (loop
, flags
);
269 if (loop
->lpt_decision
.decision
== LPT_NONE
)
270 decide_unroll_runtime_iterations (loop
, flags
);
271 if (loop
->lpt_decision
.decision
== LPT_NONE
)
272 decide_unroll_stupid (loop
, flags
);
274 report_unroll (loop
, locus
);
280 unroll_loops (int flags
)
283 bool changed
= false;
285 /* Now decide rest of unrolling. */
286 decide_unrolling (flags
);
288 /* Scan the loops, inner ones first. */
289 FOR_EACH_LOOP (loop
, LI_FROM_INNERMOST
)
291 /* And perform the appropriate transformations. */
292 switch (loop
->lpt_decision
.decision
)
294 case LPT_UNROLL_CONSTANT
:
295 unroll_loop_constant_iterations (loop
);
298 case LPT_UNROLL_RUNTIME
:
299 unroll_loop_runtime_iterations (loop
);
302 case LPT_UNROLL_STUPID
:
303 unroll_loop_stupid (loop
);
315 calculate_dominance_info (CDI_DOMINATORS
);
316 fix_loop_structure (NULL
);
322 /* Check whether exit of the LOOP is at the end of loop body. */
325 loop_exit_at_end_p (class loop
*loop
)
327 class niter_desc
*desc
= get_simple_loop_desc (loop
);
330 /* We should never have conditional in latch block. */
331 gcc_assert (desc
->in_edge
->dest
!= loop
->header
);
333 if (desc
->in_edge
->dest
!= loop
->latch
)
336 /* Check that the latch is empty. */
337 FOR_BB_INSNS (loop
->latch
, insn
)
339 if (INSN_P (insn
) && active_insn_p (insn
))
346 /* Decide whether to unroll LOOP iterating constant number of times
350 decide_unroll_constant_iterations (class loop
*loop
, int flags
)
352 unsigned nunroll
, nunroll_by_av
, best_copies
, best_unroll
= 0, n_copies
, i
;
353 class niter_desc
*desc
;
354 widest_int iterations
;
356 /* If we were not asked to unroll this loop, just return back silently. */
357 if (!(flags
& UAP_UNROLL
) && !loop
->unroll
)
360 if (dump_enabled_p ())
361 dump_printf (MSG_NOTE
,
362 "considering unrolling loop with constant "
363 "number of iterations\n");
365 /* nunroll = total number of copies of the original loop body in
366 unrolled loop (i.e. if it is 2, we have to duplicate loop body once). */
367 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS
) / loop
->ninsns
;
369 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS
) / loop
->av_ninsns
;
370 if (nunroll
> nunroll_by_av
)
371 nunroll
= nunroll_by_av
;
372 if (nunroll
> (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
))
373 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
);
375 if (targetm
.loop_unroll_adjust
)
376 nunroll
= targetm
.loop_unroll_adjust (nunroll
, loop
);
378 /* Skip big loops. */
382 fprintf (dump_file
, ";; Not considering loop, is too big\n");
386 /* Check for simple loops. */
387 desc
= get_simple_loop_desc (loop
);
389 /* Check number of iterations. */
390 if (!desc
->simple_p
|| !desc
->const_iter
|| desc
->assumptions
)
394 ";; Unable to prove that the loop iterates constant times\n");
398 /* Check for an explicit unrolling factor. */
399 if (loop
->unroll
> 0 && loop
->unroll
< USHRT_MAX
)
401 /* However we cannot unroll completely at the RTL level a loop with
402 constant number of iterations; it should have been peeled instead. */
403 if (desc
->niter
== 0 || (unsigned) loop
->unroll
> desc
->niter
- 1)
406 fprintf (dump_file
, ";; Loop should have been peeled\n");
410 loop
->lpt_decision
.decision
= LPT_UNROLL_CONSTANT
;
411 loop
->lpt_decision
.times
= loop
->unroll
- 1;
416 /* Check whether the loop rolls enough to consider.
417 Consult also loop bounds and profile; in the case the loop has more
418 than one exit it may well loop less than determined maximal number
420 if (desc
->niter
< 2 * nunroll
421 || ((get_estimated_loop_iterations (loop
, &iterations
)
422 || get_likely_max_loop_iterations (loop
, &iterations
))
423 && wi::ltu_p (iterations
, 2 * nunroll
)))
426 fprintf (dump_file
, ";; Not unrolling loop, doesn't roll\n");
430 /* Success; now compute number of iterations to unroll. We alter
431 nunroll so that as few as possible copies of loop body are
432 necessary, while still not decreasing the number of unrollings
433 too much (at most by 1). */
434 best_copies
= 2 * nunroll
+ 10;
437 if (i
> desc
->niter
- 2)
440 for (; i
>= nunroll
- 1; i
--)
442 unsigned exit_mod
= desc
->niter
% (i
+ 1);
444 if (!loop_exit_at_end_p (loop
))
445 n_copies
= exit_mod
+ i
+ 1;
446 else if (exit_mod
!= (unsigned) i
447 || desc
->noloop_assumptions
!= NULL_RTX
)
448 n_copies
= exit_mod
+ i
+ 2;
452 if (n_copies
< best_copies
)
454 best_copies
= n_copies
;
459 loop
->lpt_decision
.decision
= LPT_UNROLL_CONSTANT
;
460 loop
->lpt_decision
.times
= best_unroll
;
463 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
464 The transformation does this:
466 for (i = 0; i < 102; i++)
469 ==> (LOOP->LPT_DECISION.TIMES == 3)
483 unroll_loop_constant_iterations (class loop
*loop
)
485 unsigned HOST_WIDE_INT niter
;
489 unsigned max_unroll
= loop
->lpt_decision
.times
;
490 class niter_desc
*desc
= get_simple_loop_desc (loop
);
491 bool exit_at_end
= loop_exit_at_end_p (loop
);
492 struct opt_info
*opt_info
= NULL
;
497 /* Should not get here (such loop should be peeled instead). */
498 gcc_assert (niter
> max_unroll
+ 1);
500 exit_mod
= niter
% (max_unroll
+ 1);
502 auto_sbitmap
wont_exit (max_unroll
+ 2);
503 bitmap_ones (wont_exit
);
505 auto_vec
<edge
> remove_edges
;
506 if (flag_split_ivs_in_unroller
507 || flag_variable_expansion_in_unroller
)
508 opt_info
= analyze_insns_in_loop (loop
);
512 /* The exit is not at the end of the loop; leave exit test
513 in the first copy, so that the loops that start with test
514 of exit condition have continuous body after unrolling. */
517 fprintf (dump_file
, ";; Condition at beginning of loop.\n");
519 /* Peel exit_mod iterations. */
520 bitmap_clear_bit (wont_exit
, 0);
521 if (desc
->noloop_assumptions
)
522 bitmap_clear_bit (wont_exit
, 1);
526 opt_info_start_duplication (opt_info
);
527 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
529 wont_exit
, desc
->out_edge
,
531 DLTHE_FLAG_UPDATE_FREQ
532 | (opt_info
&& exit_mod
> 1
533 ? DLTHE_RECORD_COPY_NUMBER
537 if (opt_info
&& exit_mod
> 1)
538 apply_opt_in_copies (opt_info
, exit_mod
, false, false);
540 desc
->noloop_assumptions
= NULL_RTX
;
541 desc
->niter
-= exit_mod
;
542 loop
->nb_iterations_upper_bound
-= exit_mod
;
543 if (loop
->any_estimate
544 && wi::leu_p (exit_mod
, loop
->nb_iterations_estimate
))
545 loop
->nb_iterations_estimate
-= exit_mod
;
547 loop
->any_estimate
= false;
548 if (loop
->any_likely_upper_bound
549 && wi::leu_p (exit_mod
, loop
->nb_iterations_likely_upper_bound
))
550 loop
->nb_iterations_likely_upper_bound
-= exit_mod
;
552 loop
->any_likely_upper_bound
= false;
555 bitmap_set_bit (wont_exit
, 1);
559 /* Leave exit test in last copy, for the same reason as above if
560 the loop tests the condition at the end of loop body. */
563 fprintf (dump_file
, ";; Condition at end of loop.\n");
565 /* We know that niter >= max_unroll + 2; so we do not need to care of
566 case when we would exit before reaching the loop. So just peel
567 exit_mod + 1 iterations. */
568 if (exit_mod
!= max_unroll
569 || desc
->noloop_assumptions
)
571 bitmap_clear_bit (wont_exit
, 0);
572 if (desc
->noloop_assumptions
)
573 bitmap_clear_bit (wont_exit
, 1);
575 opt_info_start_duplication (opt_info
);
576 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
578 wont_exit
, desc
->out_edge
,
580 DLTHE_FLAG_UPDATE_FREQ
581 | (opt_info
&& exit_mod
> 0
582 ? DLTHE_RECORD_COPY_NUMBER
586 if (opt_info
&& exit_mod
> 0)
587 apply_opt_in_copies (opt_info
, exit_mod
+ 1, false, false);
589 desc
->niter
-= exit_mod
+ 1;
590 loop
->nb_iterations_upper_bound
-= exit_mod
+ 1;
591 if (loop
->any_estimate
592 && wi::leu_p (exit_mod
+ 1, loop
->nb_iterations_estimate
))
593 loop
->nb_iterations_estimate
-= exit_mod
+ 1;
595 loop
->any_estimate
= false;
596 if (loop
->any_likely_upper_bound
597 && wi::leu_p (exit_mod
+ 1, loop
->nb_iterations_likely_upper_bound
))
598 loop
->nb_iterations_likely_upper_bound
-= exit_mod
+ 1;
600 loop
->any_likely_upper_bound
= false;
601 desc
->noloop_assumptions
= NULL_RTX
;
603 bitmap_set_bit (wont_exit
, 0);
604 bitmap_set_bit (wont_exit
, 1);
607 bitmap_clear_bit (wont_exit
, max_unroll
);
610 /* Now unroll the loop. */
612 opt_info_start_duplication (opt_info
);
613 ok
= duplicate_loop_to_header_edge (loop
, loop_latch_edge (loop
),
615 wont_exit
, desc
->out_edge
,
617 DLTHE_FLAG_UPDATE_FREQ
619 ? DLTHE_RECORD_COPY_NUMBER
625 apply_opt_in_copies (opt_info
, max_unroll
, true, true);
626 free_opt_info (opt_info
);
631 basic_block exit_block
= get_bb_copy (desc
->in_edge
->src
);
632 /* Find a new in and out edge; they are in the last copy we have made. */
634 if (EDGE_SUCC (exit_block
, 0)->dest
== desc
->out_edge
->dest
)
636 desc
->out_edge
= EDGE_SUCC (exit_block
, 0);
637 desc
->in_edge
= EDGE_SUCC (exit_block
, 1);
641 desc
->out_edge
= EDGE_SUCC (exit_block
, 1);
642 desc
->in_edge
= EDGE_SUCC (exit_block
, 0);
646 desc
->niter
/= max_unroll
+ 1;
647 loop
->nb_iterations_upper_bound
648 = wi::udiv_trunc (loop
->nb_iterations_upper_bound
, max_unroll
+ 1);
649 if (loop
->any_estimate
)
650 loop
->nb_iterations_estimate
651 = wi::udiv_trunc (loop
->nb_iterations_estimate
, max_unroll
+ 1);
652 if (loop
->any_likely_upper_bound
)
653 loop
->nb_iterations_likely_upper_bound
654 = wi::udiv_trunc (loop
->nb_iterations_likely_upper_bound
, max_unroll
+ 1);
655 desc
->niter_expr
= gen_int_mode (desc
->niter
, desc
->mode
);
657 /* Remove the edges. */
658 FOR_EACH_VEC_ELT (remove_edges
, i
, e
)
663 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
664 max_unroll
, num_loop_insns (loop
));
667 /* Decide whether to unroll LOOP iterating runtime computable number of times
670 decide_unroll_runtime_iterations (class loop
*loop
, int flags
)
672 unsigned nunroll
, nunroll_by_av
, i
;
673 class niter_desc
*desc
;
674 widest_int iterations
;
676 /* If we were not asked to unroll this loop, just return back silently. */
677 if (!(flags
& UAP_UNROLL
) && !loop
->unroll
)
680 if (dump_enabled_p ())
681 dump_printf (MSG_NOTE
,
682 "considering unrolling loop with runtime-"
683 "computable number of iterations\n");
685 /* nunroll = total number of copies of the original loop body in
686 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
687 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS
) / loop
->ninsns
;
688 nunroll_by_av
= PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS
) / loop
->av_ninsns
;
689 if (nunroll
> nunroll_by_av
)
690 nunroll
= nunroll_by_av
;
691 if (nunroll
> (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
))
692 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
);
694 if (targetm
.loop_unroll_adjust
)
695 nunroll
= targetm
.loop_unroll_adjust (nunroll
, loop
);
697 if (loop
->unroll
> 0 && loop
->unroll
< USHRT_MAX
)
698 nunroll
= loop
->unroll
;
700 /* Skip big loops. */
704 fprintf (dump_file
, ";; Not considering loop, is too big\n");
708 /* Check for simple loops. */
709 desc
= get_simple_loop_desc (loop
);
711 /* Check simpleness. */
712 if (!desc
->simple_p
|| desc
->assumptions
)
716 ";; Unable to prove that the number of iterations "
717 "can be counted in runtime\n");
721 if (desc
->const_iter
)
724 fprintf (dump_file
, ";; Loop iterates constant times\n");
728 /* Check whether the loop rolls. */
729 if ((get_estimated_loop_iterations (loop
, &iterations
)
730 || get_likely_max_loop_iterations (loop
, &iterations
))
731 && wi::ltu_p (iterations
, 2 * nunroll
))
734 fprintf (dump_file
, ";; Not unrolling loop, doesn't roll\n");
738 /* Success; now force nunroll to be power of 2, as code-gen
739 requires it, we are unable to cope with overflows in
740 computation of number of iterations. */
741 for (i
= 1; 2 * i
<= nunroll
; i
*= 2)
744 loop
->lpt_decision
.decision
= LPT_UNROLL_RUNTIME
;
745 loop
->lpt_decision
.times
= i
- 1;
748 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
749 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
750 and NULL is returned instead. */
753 split_edge_and_insert (edge e
, rtx_insn
*insns
)
760 emit_insn_after (insns
, BB_END (bb
));
762 /* ??? We used to assume that INSNS can contain control flow insns, and
763 that we had to try to find sub basic blocks in BB to maintain a valid
764 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
765 and call break_superblocks when going out of cfglayout mode. But it
766 turns out that this never happens; and that if it does ever happen,
767 the verify_flow_info at the end of the RTL loop passes would fail.
769 There are two reasons why we expected we could have control flow insns
770 in INSNS. The first is when a comparison has to be done in parts, and
771 the second is when the number of iterations is computed for loops with
772 the number of iterations known at runtime. In both cases, test cases
773 to get control flow in INSNS appear to be impossible to construct:
775 * If do_compare_rtx_and_jump needs several branches to do comparison
776 in a mode that needs comparison by parts, we cannot analyze the
777 number of iterations of the loop, and we never get to unrolling it.
779 * The code in expand_divmod that was suspected to cause creation of
780 branching code seems to be only accessed for signed division. The
781 divisions used by # of iterations analysis are always unsigned.
782 Problems might arise on architectures that emits branching code
783 for some operations that may appear in the unroller (especially
784 for division), but we have no such architectures.
786 Considering all this, it was decided that we should for now assume
787 that INSNS can in theory contain control flow insns, but in practice
788 it never does. So we don't handle the theoretical case, and should
789 a real failure ever show up, we have a pretty good clue for how to
795 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
796 true, with probability PROB. If CINSN is not NULL, it is the insn to copy
797 in order to create a jump. */
800 compare_and_jump_seq (rtx op0
, rtx op1
, enum rtx_code comp
,
801 rtx_code_label
*label
, profile_probability prob
,
809 mode
= GET_MODE (op0
);
810 if (mode
== VOIDmode
)
811 mode
= GET_MODE (op1
);
814 if (GET_MODE_CLASS (mode
) == MODE_CC
)
816 /* A hack -- there seems to be no easy generic way how to make a
817 conditional jump from a ccmode comparison. */
819 cond
= XEXP (SET_SRC (pc_set (cinsn
)), 0);
820 gcc_assert (GET_CODE (cond
) == comp
);
821 gcc_assert (rtx_equal_p (op0
, XEXP (cond
, 0)));
822 gcc_assert (rtx_equal_p (op1
, XEXP (cond
, 1)));
823 emit_jump_insn (copy_insn (PATTERN (cinsn
)));
824 jump
= as_a
<rtx_jump_insn
*> (get_last_insn ());
825 JUMP_LABEL (jump
) = JUMP_LABEL (cinsn
);
826 LABEL_NUSES (JUMP_LABEL (jump
))++;
827 redirect_jump (jump
, label
, 0);
833 op0
= force_operand (op0
, NULL_RTX
);
834 op1
= force_operand (op1
, NULL_RTX
);
835 do_compare_rtx_and_jump (op0
, op1
, comp
, 0,
836 mode
, NULL_RTX
, NULL
, label
,
837 profile_probability::uninitialized ());
838 jump
= as_a
<rtx_jump_insn
*> (get_last_insn ());
839 jump
->set_jump_target (label
);
840 LABEL_NUSES (label
)++;
842 if (prob
.initialized_p ())
843 add_reg_br_prob_note (jump
, prob
);
851 /* Unroll LOOP for which we are able to count number of iterations in
852 runtime LOOP->LPT_DECISION.TIMES times. The times value must be a
853 power of two. The transformation does this (with some extra care
856 for (i = 0; i < n; i++)
859 ==> (LOOP->LPT_DECISION.TIMES == 3)
884 unroll_loop_runtime_iterations (class loop
*loop
)
886 rtx old_niter
, niter
, tmp
;
887 rtx_insn
*init_code
, *branch_code
;
889 profile_probability p
;
890 basic_block preheader
, *body
, swtch
, ezc_swtch
= NULL
;
892 profile_count iter_count
, new_count
;
895 bool extra_zero_check
, last_may_exit
;
896 unsigned max_unroll
= loop
->lpt_decision
.times
;
897 class niter_desc
*desc
= get_simple_loop_desc (loop
);
898 bool exit_at_end
= loop_exit_at_end_p (loop
);
899 struct opt_info
*opt_info
= NULL
;
902 if (flag_split_ivs_in_unroller
903 || flag_variable_expansion_in_unroller
)
904 opt_info
= analyze_insns_in_loop (loop
);
906 /* Remember blocks whose dominators will have to be updated. */
907 auto_vec
<basic_block
> dom_bbs
;
909 body
= get_loop_body (loop
);
910 for (i
= 0; i
< loop
->num_nodes
; i
++)
912 vec
<basic_block
> ldom
;
915 ldom
= get_dominated_by (CDI_DOMINATORS
, body
[i
]);
916 FOR_EACH_VEC_ELT (ldom
, j
, bb
)
917 if (!flow_bb_inside_loop_p (loop
, bb
))
918 dom_bbs
.safe_push (bb
);
926 /* Leave exit in first copy (for explanation why see comment in
927 unroll_loop_constant_iterations). */
929 n_peel
= max_unroll
- 1;
930 extra_zero_check
= true;
931 last_may_exit
= false;
935 /* Leave exit in last copy (for explanation why see comment in
936 unroll_loop_constant_iterations). */
937 may_exit_copy
= max_unroll
;
939 extra_zero_check
= false;
940 last_may_exit
= true;
943 /* Get expression for number of iterations. */
945 old_niter
= niter
= gen_reg_rtx (desc
->mode
);
946 tmp
= force_operand (copy_rtx (desc
->niter_expr
), niter
);
948 emit_move_insn (niter
, tmp
);
950 /* For loops that exit at end and whose number of iterations is reliable,
951 add one to niter to account for first pass through loop body before
952 reaching exit test. */
953 if (exit_at_end
&& !desc
->noloop_assumptions
)
955 niter
= expand_simple_binop (desc
->mode
, PLUS
,
957 NULL_RTX
, 0, OPTAB_LIB_WIDEN
);
961 /* Count modulo by ANDing it with max_unroll; we use the fact that
962 the number of unrollings is a power of two, and thus this is correct
963 even if there is overflow in the computation. */
964 niter
= expand_simple_binop (desc
->mode
, AND
,
965 niter
, gen_int_mode (max_unroll
, desc
->mode
),
966 NULL_RTX
, 0, OPTAB_LIB_WIDEN
);
968 init_code
= get_insns ();
970 unshare_all_rtl_in_chain (init_code
);
972 /* Precondition the loop. */
973 split_edge_and_insert (loop_preheader_edge (loop
), init_code
);
975 auto_vec
<edge
> remove_edges
;
977 auto_sbitmap
wont_exit (max_unroll
+ 2);
979 if (extra_zero_check
|| desc
->noloop_assumptions
)
981 /* Peel the first copy of loop body. Leave the exit test if the number
982 of iterations is not reliable. Also record the place of the extra zero
984 bitmap_clear (wont_exit
);
985 if (!desc
->noloop_assumptions
)
986 bitmap_set_bit (wont_exit
, 1);
987 ezc_swtch
= loop_preheader_edge (loop
)->src
;
988 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
989 1, wont_exit
, desc
->out_edge
,
991 DLTHE_FLAG_UPDATE_FREQ
);
995 /* Record the place where switch will be built for preconditioning. */
996 swtch
= split_edge (loop_preheader_edge (loop
));
998 /* Compute count increments for each switch block and initialize
999 innermost switch block. Switch blocks and peeled loop copies are built
1000 from innermost outward. */
1001 iter_count
= new_count
= swtch
->count
.apply_scale (1, max_unroll
+ 1);
1002 swtch
->count
= new_count
;
1004 for (i
= 0; i
< n_peel
; i
++)
1006 /* Peel the copy. */
1007 bitmap_clear (wont_exit
);
1008 if (i
!= n_peel
- 1 || !last_may_exit
)
1009 bitmap_set_bit (wont_exit
, 1);
1010 ok
= duplicate_loop_to_header_edge (loop
, loop_preheader_edge (loop
),
1011 1, wont_exit
, desc
->out_edge
,
1013 DLTHE_FLAG_UPDATE_FREQ
);
1016 /* Create item for switch. */
1017 j
= n_peel
- i
- (extra_zero_check
? 0 : 1);
1018 p
= profile_probability::always ().apply_scale (1, i
+ 2);
1020 preheader
= split_edge (loop_preheader_edge (loop
));
1021 /* Add in count of edge from switch block. */
1022 preheader
->count
+= iter_count
;
1023 branch_code
= compare_and_jump_seq (copy_rtx (niter
),
1024 gen_int_mode (j
, desc
->mode
), EQ
,
1025 block_label (preheader
), p
, NULL
);
1027 /* We rely on the fact that the compare and jump cannot be optimized out,
1028 and hence the cfg we create is correct. */
1029 gcc_assert (branch_code
!= NULL_RTX
);
1031 swtch
= split_edge_and_insert (single_pred_edge (swtch
), branch_code
);
1032 set_immediate_dominator (CDI_DOMINATORS
, preheader
, swtch
);
1033 single_succ_edge (swtch
)->probability
= p
.invert ();
1034 new_count
+= iter_count
;
1035 swtch
->count
= new_count
;
1036 e
= make_edge (swtch
, preheader
,
1037 single_succ_edge (swtch
)->flags
& EDGE_IRREDUCIBLE_LOOP
);
1041 if (extra_zero_check
)
1043 /* Add branch for zero iterations. */
1044 p
= profile_probability::always ().apply_scale (1, max_unroll
+ 1);
1046 preheader
= split_edge (loop_preheader_edge (loop
));
1047 /* Recompute count adjustments since initial peel copy may
1048 have exited and reduced those values that were computed above. */
1049 iter_count
= swtch
->count
.apply_scale (1, max_unroll
+ 1);
1050 /* Add in count of edge from switch block. */
1051 preheader
->count
+= iter_count
;
1052 branch_code
= compare_and_jump_seq (copy_rtx (niter
), const0_rtx
, EQ
,
1053 block_label (preheader
), p
,
1055 gcc_assert (branch_code
!= NULL_RTX
);
1057 swtch
= split_edge_and_insert (single_succ_edge (swtch
), branch_code
);
1058 set_immediate_dominator (CDI_DOMINATORS
, preheader
, swtch
);
1059 single_succ_edge (swtch
)->probability
= p
.invert ();
1060 e
= make_edge (swtch
, preheader
,
1061 single_succ_edge (swtch
)->flags
& EDGE_IRREDUCIBLE_LOOP
);
1065 /* Recount dominators for outer blocks. */
1066 iterate_fix_dominators (CDI_DOMINATORS
, dom_bbs
, false);
1068 /* And unroll loop. */
1070 bitmap_ones (wont_exit
);
1071 bitmap_clear_bit (wont_exit
, may_exit_copy
);
1072 opt_info_start_duplication (opt_info
);
1074 ok
= duplicate_loop_to_header_edge (loop
, loop_latch_edge (loop
),
1076 wont_exit
, desc
->out_edge
,
1078 DLTHE_FLAG_UPDATE_FREQ
1080 ? DLTHE_RECORD_COPY_NUMBER
1086 apply_opt_in_copies (opt_info
, max_unroll
, true, true);
1087 free_opt_info (opt_info
);
1092 basic_block exit_block
= get_bb_copy (desc
->in_edge
->src
);
1093 /* Find a new in and out edge; they are in the last copy we have
1096 if (EDGE_SUCC (exit_block
, 0)->dest
== desc
->out_edge
->dest
)
1098 desc
->out_edge
= EDGE_SUCC (exit_block
, 0);
1099 desc
->in_edge
= EDGE_SUCC (exit_block
, 1);
1103 desc
->out_edge
= EDGE_SUCC (exit_block
, 1);
1104 desc
->in_edge
= EDGE_SUCC (exit_block
, 0);
1108 /* Remove the edges. */
1109 FOR_EACH_VEC_ELT (remove_edges
, i
, e
)
1112 /* We must be careful when updating the number of iterations due to
1113 preconditioning and the fact that the value must be valid at entry
1114 of the loop. After passing through the above code, we see that
1115 the correct new number of iterations is this: */
1116 gcc_assert (!desc
->const_iter
);
1118 simplify_gen_binary (UDIV
, desc
->mode
, old_niter
,
1119 gen_int_mode (max_unroll
+ 1, desc
->mode
));
1120 loop
->nb_iterations_upper_bound
1121 = wi::udiv_trunc (loop
->nb_iterations_upper_bound
, max_unroll
+ 1);
1122 if (loop
->any_estimate
)
1123 loop
->nb_iterations_estimate
1124 = wi::udiv_trunc (loop
->nb_iterations_estimate
, max_unroll
+ 1);
1125 if (loop
->any_likely_upper_bound
)
1126 loop
->nb_iterations_likely_upper_bound
1127 = wi::udiv_trunc (loop
->nb_iterations_likely_upper_bound
, max_unroll
+ 1);
1131 simplify_gen_binary (MINUS
, desc
->mode
, desc
->niter_expr
, const1_rtx
);
1132 desc
->noloop_assumptions
= NULL_RTX
;
1133 --loop
->nb_iterations_upper_bound
;
1134 if (loop
->any_estimate
1135 && loop
->nb_iterations_estimate
!= 0)
1136 --loop
->nb_iterations_estimate
;
1138 loop
->any_estimate
= false;
1139 if (loop
->any_likely_upper_bound
1140 && loop
->nb_iterations_likely_upper_bound
!= 0)
1141 --loop
->nb_iterations_likely_upper_bound
;
1143 loop
->any_likely_upper_bound
= false;
1148 ";; Unrolled loop %d times, counting # of iterations "
1149 "in runtime, %i insns\n",
1150 max_unroll
, num_loop_insns (loop
));
1153 /* Decide whether to unroll LOOP stupidly and how much. */
/* Records the verdict in loop->lpt_decision; FLAGS carries the UAP_*
   permission bits for this unrolling pass.  */
1155 decide_unroll_stupid (class loop
*loop
, int flags
)
1157 unsigned nunroll
, nunroll_by_av
, i
;
1158 class niter_desc
*desc
;
1159 widest_int iterations
;
1161 /* If we were not asked to unroll this loop, just return back silently. */
1162 if (!(flags
& UAP_UNROLL_ALL
) && !loop
->unroll
)
1165 if (dump_enabled_p ())
1166 dump_printf (MSG_NOTE
, "considering unrolling loop stupidly\n");
1168 /* nunroll = total number of copies of the original loop body in
1169 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
/* Cap the factor by total unrolled size, then by average size per
   iteration, then by the global PARAM_MAX_UNROLL_TIMES limit.  */
1170 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS
) / loop
->ninsns
;
1172 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS
) / loop
->av_ninsns
;
1173 if (nunroll
> nunroll_by_av
)
1174 nunroll
= nunroll_by_av
;
1175 if (nunroll
> (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
))
1176 nunroll
= PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
);
/* Give the target hook a chance to adjust the chosen factor.  */
1178 if (targetm
.loop_unroll_adjust
)
1179 nunroll
= targetm
.loop_unroll_adjust (nunroll
, loop
);
/* An explicit per-loop unroll count overrides the heuristic value.
   NOTE(review): presumably set from a user annotation/pragma -- confirm.  */
1181 if (loop
->unroll
> 0 && loop
->unroll
< USHRT_MAX
)
1182 nunroll
= loop
->unroll
;
1184 /* Skip big loops. */
1188 fprintf (dump_file
, ";; Not considering loop, is too big\n");
1192 /* Check for simple loops. */
1193 desc
= get_simple_loop_desc (loop
);
1195 /* Check simpleness. */
/* Simple loops are left to the constant/runtime-iteration unrollers.  */
1196 if (desc
->simple_p
&& !desc
->assumptions
)
1199 fprintf (dump_file
, ";; Loop is simple\n");
1203 /* Do not unroll loops with branches inside -- it increases number
1205 TODO: this heuristic needs tunning; call inside the loop body
1206 is also relatively good reason to not unroll. */
1207 if (num_loop_branches (loop
) > 1)
1210 fprintf (dump_file
, ";; Not unrolling, contains branches\n");
1214 /* Check whether the loop rolls. */
/* Reject loops expected to iterate fewer than 2 * nunroll times.  */
1215 if ((get_estimated_loop_iterations (loop
, &iterations
)
1216 || get_likely_max_loop_iterations (loop
, &iterations
))
1217 && wi::ltu_p (iterations
, 2 * nunroll
))
1220 fprintf (dump_file
, ";; Not unrolling loop, doesn't roll\n");
1224 /* Success. Now force nunroll to be power of 2, as it seems that this
1225 improves results (partially because of better alignments, partially
1226 because of some dark magic). */
1227 for (i
= 1; 2 * i
<= nunroll
; i
*= 2)
/* I is the largest power of two not exceeding NUNROLL; unroll I times,
   i.e. make I - 1 extra copies of the body.  */
1230 loop
->lpt_decision
.decision
= LPT_UNROLL_STUPID
;
1231 loop
->lpt_decision
.times
= i
- 1;
1234 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1239 ==> (LOOP->LPT_DECISION.TIMES == 3)
/* "Stupid" unrolling: WONT_EXIT is cleared below, so every copy keeps
   its exit test -- no exit-condition elimination is performed.  */
1253 unroll_loop_stupid (class loop
*loop
)
1255 unsigned nunroll
= loop
->lpt_decision
.times
;
1256 class niter_desc
*desc
= get_simple_loop_desc (loop
);
1257 struct opt_info
*opt_info
= NULL
;
/* Collect IV-splitting / accumulator-expansion opportunities first, if
   either optimization is enabled.  */
1260 if (flag_split_ivs_in_unroller
1261 || flag_variable_expansion_in_unroller
)
1262 opt_info
= analyze_insns_in_loop (loop
);
1264 auto_sbitmap
wont_exit (nunroll
+ 1);
1265 bitmap_clear (wont_exit
);
1266 opt_info_start_duplication (opt_info
);
1268 ok
= duplicate_loop_to_header_edge (loop
, loop_latch_edge (loop
),
1271 DLTHE_FLAG_UPDATE_FREQ
1273 ? DLTHE_RECORD_COPY_NUMBER
/* Apply the recorded optimizations in all copies, including the
   original body (last two arguments are true).  */
1279 apply_opt_in_copies (opt_info
, nunroll
, true, true);
1280 free_opt_info (opt_info
);
1285 /* We indeed may get here provided that there are nontrivial assumptions
1286 for a loop to be really simple. We could update the counts, but the
1287 problem is that we are unable to decide which exit will be taken
1288 (not really true in case the number of iterations is constant,
1289 but no one will do anything with this information, so we do not
1291 desc
->simple_p
= false;
1295 fprintf (dump_file
, ";; Unrolled loop %d times, %i insns\n",
1296 nunroll
, num_loop_insns (loop
));
1299 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1300 Set *DEBUG_USES to the number of debug insns that reference the
/* Walks every insn of every block in LOOP; returns true iff exactly one
   nondebug insn mentions REG (the scan bails out as soon as a second
   nondebug reference is counted).  */
1304 referenced_in_one_insn_in_loop_p (class loop
*loop
, rtx reg
,
1307 basic_block
*body
, bb
;
1312 body
= get_loop_body (loop
);
1313 for (i
= 0; i
< loop
->num_nodes
; i
++)
1317 FOR_BB_INSNS (bb
, insn
)
/* Debug insns are tallied separately and do not affect the result.  */
1318 if (!rtx_referenced_p (reg
, insn
))
1320 else if (DEBUG_INSN_P (insn
))
1322 else if (++count_ref
> 1)
1326 return (count_ref
== 1);
1329 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
/* Replaces the location expression of each such debug insn with
   gen_rtx_UNKNOWN_VAR_LOC, i.e. marks the value as unavailable.  */
1332 reset_debug_uses_in_loop (class loop
*loop
, rtx reg
, int debug_uses
)
1334 basic_block
*body
, bb
;
1338 body
= get_loop_body (loop
);
/* Stop scanning blocks once all DEBUG_USES insns have been handled.  */
1339 for (i
= 0; debug_uses
&& i
< loop
->num_nodes
; i
++)
1343 FOR_BB_INSNS (bb
, insn
)
1344 if (!DEBUG_INSN_P (insn
) || !rtx_referenced_p (reg
, insn
))
1348 validate_change (insn
, &INSN_VAR_LOCATION_LOC (insn
),
1349 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1357 /* Determine whether INSN contains an accumulator
1358 which can be expanded into separate copies,
1359 one for each copy of the LOOP body.
1361 for (i = 0 ; i < n; i++)
1375 Return NULL if INSN contains no opportunity for expansion of accumulator.
1376 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1377 information and return a pointer to it.
1380 static struct var_to_expand
*
1381 analyze_insn_to_expand_var (class loop
*loop
, rtx_insn
*insn
)
1384 struct var_to_expand
*ves
;
/* Only single-set insns of the shape dest = dest OP x qualify.  */
1389 set
= single_set (insn
);
1393 dest
= SET_DEST (set
);
1394 src
= SET_SRC (set
);
1395 code
= GET_CODE (src
);
/* Accumulation is recognized for PLUS, MINUS, MULT and FMA only.  */
1397 if (code
!= PLUS
&& code
!= MINUS
&& code
!= MULT
&& code
!= FMA
)
/* Expanding a floating-point accumulator reassociates the reduction,
   which is only valid under -fassociative-math.  */
1400 if (FLOAT_MODE_P (GET_MODE (dest
)))
1402 if (!flag_associative_math
)
1404 /* In the case of FMA, we're also changing the rounding. */
1405 if (code
== FMA
&& !flag_unsafe_math_optimizations
)
1409 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1410 in MD. But if there is no optab to generate the insn, we cannot
1411 perform the variable expansion. This can happen if an MD provides
1412 an insn but not a named pattern to generate it, for example to avoid
1413 producing code that needs additional mode switches like for x87/mmx.
1415 So we check have_insn_for which looks for an optab for the operation
1416 in SRC. If it doesn't exist, we can't perform the expansion even
1417 though INSN is valid. */
1418 if (!have_insn_for (code
, GET_MODE (src
)))
/* The accumulator must be a REG, or a SUBREG of a REG.  */
1422 && !(GET_CODE (dest
) == SUBREG
1423 && REG_P (SUBREG_REG (dest
))))
1426 /* Find the accumulator use within the operation. */
1429 /* We only support accumulation via FMA in the ADD position. */
1430 if (!rtx_equal_p (dest
, XEXP (src
, 2)))
1434 else if (rtx_equal_p (dest
, XEXP (src
, 0)))
1436 else if (rtx_equal_p (dest
, XEXP (src
, 1)))
1438 /* The method of expansion that we are using; which includes the
1439 initialization of the expansions with zero and the summation of
1440 the expansions at the end of the computation will yield wrong
1441 results for (x = something - x) thus avoid using it in that case. */
1449 /* It must not otherwise be used. */
/* Reject insns where the accumulator also appears inside a
   non-accumulator operand.  */
1452 if (rtx_referenced_p (dest
, XEXP (src
, 0))
1453 || rtx_referenced_p (dest
, XEXP (src
, 1)))
1456 else if (rtx_referenced_p (dest
, XEXP (src
, 1 - accum_pos
)))
1459 /* It must be used in exactly one insn. */
1460 if (!referenced_in_one_insn_in_loop_p (loop
, dest
, &debug_uses
))
1465 fprintf (dump_file
, "\n;; Expanding Accumulator ");
1466 print_rtl (dump_file
, dest
);
1467 fprintf (dump_file
, "\n");
1471 /* Instead of resetting the debug insns, we could replace each
1472 debug use in the loop with the sum or product of all expanded
1473 accumulators. Since we'll only know of all expansions at the
1474 end, we'd have to keep track of which vars_to_expand a debug
1475 insn in the loop references, take note of each copy of the
1476 debug insn during unrolling, and when it's all done, compute
1477 the sum or product of each variable and adjust the original
1478 debug insn and each copy thereof. What a pain! */
1479 reset_debug_uses_in_loop (loop
, dest
, debug_uses
);
1481 /* Record the accumulator to expand. */
1482 ves
= XNEW (struct var_to_expand
);
1484 ves
->reg
= copy_rtx (dest
);
1485 ves
->var_expansions
.create (1);
1487 ves
->op
= GET_CODE (src
);
1488 ves
->expansion_count
= 0;
1489 ves
->reuse_expansion
= 0;
1493 /* Determine whether there is an induction variable in INSN that
1494 we would like to split during unrolling.
1514 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1515 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1518 static struct iv_to_split
*
1519 analyze_iv_to_split_insn (rtx_insn
*insn
)
1523 struct iv_to_split
*ivts
;
1524 scalar_int_mode mode
;
1527 /* For now we just split the basic induction variables. Later this may be
1528 extended for example by selecting also addresses of memory references. */
1529 set
= single_set (insn
);
/* Only register destinations with a scalar integer mode qualify.  */
1533 dest
= SET_DEST (set
);
1534 if (!REG_P (dest
) || !is_a
<scalar_int_mode
> (GET_MODE (dest
), &mode
))
1537 if (!biv_p (insn
, mode
, dest
))
1540 ok
= iv_analyze_result (insn
, dest
, &iv
);
1542 /* This used to be an assert under the assumption that if biv_p returns
1543 true that iv_analyze_result must also return true. However, that
1544 assumption is not strictly correct as evidenced by pr25569.
1546 Returning NULL when iv_analyze_result returns false is safe and
1547 avoids the problems in pr25569 until the iv_analyze_* routines
1548 can be fixed, which is apparently hard and time consuming
1549 according to their author. */
/* Splitting only handles IVs with a nonzero step and no extension
   (IV mode must equal its extend_mode).  */
1553 if (iv
.step
== const0_rtx
1554 || iv
.mode
!= iv
.extend_mode
)
1557 /* Record the insn to split. */
1558 ivts
= XNEW (struct iv_to_split
);
1560 ivts
->orig_var
= dest
;
/* base_var is allocated later by allocate_basic_variable.  */
1561 ivts
->base_var
= NULL_RTX
;
1562 ivts
->step
= iv
.step
;
1568 /* Determines which of insns in LOOP can be optimized.
1569 Return a OPT_INFO struct with the relevant hash tables filled
1570 with all insns to be optimized. The FIRST_NEW_BLOCK field
1571 is undefined for the return value. */
1573 static struct opt_info
*
1574 analyze_insns_in_loop (class loop
*loop
)
1576 basic_block
*body
, bb
;
1578 struct opt_info
*opt_info
= XCNEW (struct opt_info
);
1580 struct iv_to_split
*ivts
= NULL
;
1581 struct var_to_expand
*ves
= NULL
;
1582 iv_to_split
**slot1
;
1583 var_to_expand
**slot2
;
1584 vec
<edge
> edges
= get_loop_exit_edges (loop
);
1586 bool can_apply
= false;
1588 iv_analysis_loop_init (loop
);
1590 body
= get_loop_body (loop
);
/* Hash tables are sized proportionally to the loop body (5 slots per
   basic block).  */
1592 if (flag_split_ivs_in_unroller
)
1594 opt_info
->insns_to_split
1595 = new hash_table
<iv_split_hasher
> (5 * loop
->num_nodes
);
1596 opt_info
->iv_to_split_head
= NULL
;
1597 opt_info
->iv_to_split_tail
= &opt_info
->iv_to_split_head
;
1600 /* Record the loop exit bb and loop preheader before the unrolling. */
1601 opt_info
->loop_preheader
= loop_preheader_edge (loop
)->src
;
/* Variable expansion needs a single non-complex exit edge on which to
   place the final combination code; split it to get a dedicated bb.  */
1603 if (edges
.length () == 1)
1606 if (!(exit
->flags
& EDGE_COMPLEX
))
1608 opt_info
->loop_exit
= split_edge (exit
);
1613 if (flag_variable_expansion_in_unroller
1616 opt_info
->insns_with_var_to_expand
1617 = new hash_table
<var_expand_hasher
> (5 * loop
->num_nodes
);
1618 opt_info
->var_to_expand_head
= NULL
;
1619 opt_info
->var_to_expand_tail
= &opt_info
->var_to_expand_head
;
1622 for (i
= 0; i
< loop
->num_nodes
; i
++)
/* Only scan blocks dominating the latch, i.e. blocks executed on every
   iteration of the loop.  */
1625 if (!dominated_by_p (CDI_DOMINATORS
, loop
->latch
, bb
))
1628 FOR_BB_INSNS (bb
, insn
)
1633 if (opt_info
->insns_to_split
)
1634 ivts
= analyze_iv_to_split_insn (insn
);
/* Record each candidate both in the hash table (lookup by insn) and
   on the singly-linked list (ordered traversal).  */
1638 slot1
= opt_info
->insns_to_split
->find_slot (ivts
, INSERT
);
1639 gcc_assert (*slot1
== NULL
);
1641 *opt_info
->iv_to_split_tail
= ivts
;
1642 opt_info
->iv_to_split_tail
= &ivts
->next
;
1646 if (opt_info
->insns_with_var_to_expand
)
1647 ves
= analyze_insn_to_expand_var (loop
, insn
);
1651 slot2
= opt_info
->insns_with_var_to_expand
->find_slot (ves
, INSERT
);
1652 gcc_assert (*slot2
== NULL
);
1654 *opt_info
->var_to_expand_tail
= ves
;
1655 opt_info
->var_to_expand_tail
= &ves
->next
;
1665 /* Called just before loop duplication. Records start of duplicated area
/* first_new_block marks where the copies begin: every basic block
   created by the upcoming duplication gets an index >= this value.  */
1669 opt_info_start_duplication (struct opt_info
*opt_info
)
1672 opt_info
->first_new_block
= last_basic_block_for_fn (cfun
);
1675 /* Determine the number of iterations between initialization of the base
1676 variable and the current copy (N_COPY). N_COPIES is the total number
1677 of newly created copies. UNROLLING is true if we are unrolling
1678 (not peeling) the loop. */
/* The result is the DELTA later multiplied by the IV step in split_iv.  */
1681 determine_split_iv_delta (unsigned n_copy
, unsigned n_copies
, bool unrolling
)
1685 /* If we are unrolling, initialization is done in the original loop
1691 /* If we are peeling, the copy in that the initialization occurs has
1692 number 1. The original loop (number 0) is the last. */
1700 /* Allocate basic variable for the induction variable chain. */
/* Creates a fresh pseudo register in the same mode as the IV's defining
   expression; it is initialized later by insert_base_initialization.  */
1703 allocate_basic_variable (struct iv_to_split
*ivts
)
1705 rtx expr
= SET_SRC (single_set (ivts
->insn
));
1707 ivts
->base_var
= gen_reg_rtx (GET_MODE (expr
));
1710 /* Insert initialization of basic variable of IVTS before INSN, taking
1711 the initial value from INSN. */
1714 insert_base_initialization (struct iv_to_split
*ivts
, rtx_insn
*insn
)
1716 rtx expr
= copy_rtx (SET_SRC (single_set (insn
)));
/* force_operand may legitimize EXPR directly into BASE_VAR; emit an
   explicit move only when it did not.  */
1720 expr
= force_operand (expr
, ivts
->base_var
);
1721 if (expr
!= ivts
->base_var
)
1722 emit_move_insn (ivts
->base_var
, expr
);
/* The generated sequence is placed immediately before INSN.  */
1726 emit_insn_before (seq
, insn
);
1729 /* Replace the use of induction variable described in IVTS in INSN
1730 by base variable + DELTA * step. */
/* Tries three strategies in turn: (1) substitute the expression
   directly, (2) force it into a fresh register first, (3) rebuild the
   whole assignment from scratch with force_operand.  */
1733 split_iv (struct iv_to_split
*ivts
, rtx_insn
*insn
, unsigned delta
)
1735 rtx expr
, *loc
, incr
, var
;
1737 machine_mode mode
= GET_MODE (ivts
->base_var
);
1740 /* Construct base + DELTA * step. */
1742 expr
= ivts
->base_var
;
1745 incr
= simplify_gen_binary (MULT
, mode
,
1746 copy_rtx (ivts
->step
),
1747 gen_int_mode (delta
, mode
));
1748 expr
= simplify_gen_binary (PLUS
, GET_MODE (ivts
->base_var
),
1749 ivts
->base_var
, incr
);
1752 /* Figure out where to do the replacement. */
1753 loc
= &SET_SRC (single_set (insn
));
1755 /* If we can make the replacement right away, we're done. */
1756 if (validate_change (insn
, loc
, expr
, 0))
1759 /* Otherwise, force EXPR into a register and try again. */
1761 var
= gen_reg_rtx (mode
);
1762 expr
= force_operand (expr
, var
);
1764 emit_move_insn (var
, expr
);
1767 emit_insn_before (seq
, insn
);
1769 if (validate_change (insn
, loc
, var
, 0))
1772 /* The last chance. Try recreating the assignment in insn
1773 completely from scratch. */
1774 set
= single_set (insn
);
1779 src
= copy_rtx (SET_SRC (set
));
1780 dest
= copy_rtx (SET_DEST (set
));
1781 src
= force_operand (src
, dest
);
1783 emit_move_insn (dest
, src
);
1787 emit_insn_before (seq
, insn
);
1792 /* Return one expansion of the accumulator recorded in struct VE. */
/* Hands out the recorded expansion registers round-robin:
   reuse_expansion == 0 selects the original accumulator, otherwise
   index reuse_expansion - 1 into var_expansions; the counter wraps
   back to 0 after the last expansion.  */
1795 get_expansion (struct var_to_expand
*ve
)
1799 if (ve
->reuse_expansion
== 0)
1802 reg
= ve
->var_expansions
[ve
->reuse_expansion
- 1];
1804 if (ve
->var_expansions
.length () == (unsigned) ve
->reuse_expansion
)
1805 ve
->reuse_expansion
= 0;
1807 ve
->reuse_expansion
++;
1813 /* Given INSN replace the uses of the accumulator recorded in VE
1814 with a new register. */
1817 expand_var_during_unrolling (struct var_to_expand
*ve
, rtx_insn
*insn
)
1820 bool really_new_expansion
= false;
1822 set
= single_set (insn
);
1825 /* Generate a new register only if the expansion limit has not been
1826 reached. Else reuse an already existing expansion. */
1827 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS
) > ve
->expansion_count
)
1829 really_new_expansion
= true;
1830 new_reg
= gen_reg_rtx (GET_MODE (ve
->reg
));
1833 new_reg
= get_expansion (ve
);
/* Replace all uses in INSN as a group; record a fresh register only
   when the whole change group is accepted.  */
1835 validate_replace_rtx_group (SET_DEST (set
), new_reg
, insn
);
1836 if (apply_change_group ())
1837 if (really_new_expansion
)
1839 ve
->var_expansions
.safe_push (new_reg
);
1840 ve
->expansion_count
++;
1844 /* Initialize the variable expansions in loop preheader. PLACE is the
1845 loop-preheader basic block where the initialization of the
1846 expansions should take place. The expansions are initialized with
1847 (-0) when the operation is plus or minus to honor sign zero. This
1848 way we can prevent cases where the sign of the final result is
1849 effected by the sign of the expansion. Here is an example to
1852 for (i = 0 ; i < n; i++)
1866 When SUM is initialized with -zero and SOMETHING is also -zero; the
1867 final result of sum should be -zero thus the expansions sum1 and sum2
1868 should be initialized with -zero as well (otherwise we will get +zero
1869 as the final result). */
1872 insert_var_expansion_initialization (struct var_to_expand
*ve
,
1878 machine_mode mode
= GET_MODE (ve
->reg
);
1879 bool honor_signed_zero_p
= HONOR_SIGNED_ZEROS (mode
);
/* Nothing to do if no expansions were actually created.  */
1881 if (ve
->var_expansions
.length () == 0)
1888 /* Note that we only accumulate FMA via the ADD operand. */
1891 FOR_EACH_VEC_ELT (ve
->var_expansions
, i
, var
)
/* Additive expansions are seeded with (-)0.  */
1893 if (honor_signed_zero_p
)
1894 zero_init
= simplify_gen_unary (NEG
, mode
, CONST0_RTX (mode
), mode
);
1896 zero_init
= CONST0_RTX (mode
);
1897 emit_move_insn (var
, zero_init
);
1902 FOR_EACH_VEC_ELT (ve
->var_expansions
, i
, var
)
/* Despite the variable's name, multiplicative expansions are seeded
   with the identity 1, not 0.  */
1904 zero_init
= CONST1_RTX (GET_MODE (var
));
1905 emit_move_insn (var
, zero_init
);
/* The initializations go at the end of PLACE (the preheader).  */
1916 emit_insn_after (seq
, BB_END (place
));
1919 /* Combine the variable expansions at the loop exit. PLACE is the
1920 loop exit basic block where the summation of the expansions should
1924 combine_var_copies_in_loop_exit (struct var_to_expand
*ve
, basic_block place
)
1928 rtx_insn
*seq
, *insn
;
/* Nothing to combine if no expansions were actually created.  */
1931 if (ve
->var_expansions
.length () == 0)
1934 /* ve->reg might be SUBREG or some other non-shareable RTL, and we use
1935 it both here and as the destination of the assignment. */
1936 sum
= copy_rtx (sum
);
1941 /* Note that we only accumulate FMA via the ADD operand. */
/* Fold every expansion into SUM with the accumulator's operation.  */
1944 FOR_EACH_VEC_ELT (ve
->var_expansions
, i
, var
)
1945 sum
= simplify_gen_binary (PLUS
, GET_MODE (ve
->reg
), var
, sum
);
1949 FOR_EACH_VEC_ELT (ve
->var_expansions
, i
, var
)
1950 sum
= simplify_gen_binary (MULT
, GET_MODE (ve
->reg
), var
, sum
);
1957 expr
= force_operand (sum
, ve
->reg
);
1958 if (expr
!= ve
->reg
)
1959 emit_move_insn (ve
->reg
, expr
);
/* Insert the combination sequence right after PLACE's basic-block
   note, i.e. at the very start of the exit block.  */
1963 insn
= BB_HEAD (place
);
1964 while (!NOTE_INSN_BASIC_BLOCK_P (insn
))
1965 insn
= NEXT_INSN (insn
);
1967 emit_insn_after (seq
, insn
);
1970 /* Strip away REG_EQUAL notes for IVs we're splitting.
1972 Updating REG_EQUAL notes for IVs we split is tricky: We
1973 cannot tell until after unrolling, DF-rescanning, and liveness
1974 updating, whether an EQ_USE is reached by the split IV while
1975 the IV reg is still live. See PR55006.
1977 ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
1978 because RTL loop-iv requires us to defer rescanning insns and
1979 any notes attached to them. So resort to old techniques... */
1982 maybe_strip_eq_note_for_split_iv (struct opt_info
*opt_info
, rtx_insn
*insn
)
1984 struct iv_to_split
*ivts
;
1985 rtx note
= find_reg_equal_equiv_note (insn
);
/* Drop the note if it mentions any IV that is being split.  */
1988 for (ivts
= opt_info
->iv_to_split_head
; ivts
; ivts
= ivts
->next
)
1989 if (reg_mentioned_p (ivts
->orig_var
, note
))
1991 remove_note (insn
, note
);
1996 /* Apply loop optimizations in loop copies using the
1997 data which gathered during the unrolling. Structure
1998 OPT_INFO record that data.
2000 UNROLLING is true if we unrolled (not peeled) the loop.
2001 REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2002 the loop (as it should happen in complete unrolling, but not in ordinary
2003 peeling of the loop). */
2006 apply_opt_in_copies (struct opt_info
*opt_info
,
2007 unsigned n_copies
, bool unrolling
,
2008 bool rewrite_original_loop
)
2011 basic_block bb
, orig_bb
;
2012 rtx_insn
*insn
, *orig_insn
, *next
;
2013 struct iv_to_split ivts_templ
, *ivts
;
2014 struct var_to_expand ve_templ
, *ves
;
2016 /* Sanity check -- we need to put initialization in the original loop
2018 gcc_assert (!unrolling
|| rewrite_original_loop
);
2020 /* Allocate the basic variables (i0). */
2021 if (opt_info
->insns_to_split
)
2022 for (ivts
= opt_info
->iv_to_split_head
; ivts
; ivts
= ivts
->next
)
2023 allocate_basic_variable (ivts
);
/* Pass 1: walk the newly created blocks (index >= first_new_block) and
   rewrite the recorded insns in each copy.  */
2025 for (i
= opt_info
->first_new_block
;
2026 i
< (unsigned) last_basic_block_for_fn (cfun
);
2029 bb
= BASIC_BLOCK_FOR_FN (cfun
, i
);
2030 orig_bb
= get_bb_original (bb
);
2032 /* bb->aux holds position in copy sequence initialized by
2033 duplicate_loop_to_header_edge. */
2034 delta
= determine_split_iv_delta ((size_t)bb
->aux
, n_copies
,
2037 orig_insn
= BB_HEAD (orig_bb
);
2038 FOR_BB_INSNS_SAFE (bb
, insn
, next
)
/* Skip non-insns and label debug binds so that INSN and ORIG_INSN
   stay in lock step across the copy and the original block.  */
2041 || (DEBUG_BIND_INSN_P (insn
)
2042 && INSN_VAR_LOCATION_DECL (insn
)
2043 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn
)) == LABEL_DECL
))
2046 while (!INSN_P (orig_insn
)
2047 || (DEBUG_BIND_INSN_P (orig_insn
)
2048 && INSN_VAR_LOCATION_DECL (orig_insn
)
2049 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn
))
2051 orig_insn
= NEXT_INSN (orig_insn
);
/* Hash lookups below are keyed by the ORIGINAL insn.  */
2053 ivts_templ
.insn
= orig_insn
;
2054 ve_templ
.insn
= orig_insn
;
2056 /* Apply splitting iv optimization. */
2057 if (opt_info
->insns_to_split
)
2059 maybe_strip_eq_note_for_split_iv (opt_info
, insn
);
2061 ivts
= opt_info
->insns_to_split
->find (&ivts_templ
);
2065 gcc_assert (GET_CODE (PATTERN (insn
))
2066 == GET_CODE (PATTERN (orig_insn
)));
2069 insert_base_initialization (ivts
, insn
);
2070 split_iv (ivts
, insn
, delta
);
2073 /* Apply variable expansion optimization. */
2074 if (unrolling
&& opt_info
->insns_with_var_to_expand
)
2076 ves
= (struct var_to_expand
*)
2077 opt_info
->insns_with_var_to_expand
->find (&ve_templ
);
2080 gcc_assert (GET_CODE (PATTERN (insn
))
2081 == GET_CODE (PATTERN (orig_insn
)));
2082 expand_var_during_unrolling (ves
, insn
);
2085 orig_insn
= NEXT_INSN (orig_insn
);
2089 if (!rewrite_original_loop
)
2092 /* Initialize the variable expansions in the loop preheader
2093 and take care of combining them at the loop exit. */
2094 if (opt_info
->insns_with_var_to_expand
)
2096 for (ves
= opt_info
->var_to_expand_head
; ves
; ves
= ves
->next
)
2097 insert_var_expansion_initialization (ves
, opt_info
->loop_preheader
);
2098 for (ves
= opt_info
->var_to_expand_head
; ves
; ves
= ves
->next
)
2099 combine_var_copies_in_loop_exit (ves
, opt_info
->loop_exit
);
2102 /* Rewrite also the original loop body. Find them as originals of the blocks
2103 in the last copied iteration, i.e. those that have
2104 get_bb_copy (get_bb_original (bb)) == bb. */
/* Pass 2: rewrite the original body, reached through the copies whose
   original maps straight back to them.  */
2105 for (i
= opt_info
->first_new_block
;
2106 i
< (unsigned) last_basic_block_for_fn (cfun
);
2109 bb
= BASIC_BLOCK_FOR_FN (cfun
, i
);
2110 orig_bb
= get_bb_original (bb
);
2111 if (get_bb_copy (orig_bb
) != bb
)
/* The original body is copy number 0, hence delta for n_copy == 0.  */
2114 delta
= determine_split_iv_delta (0, n_copies
, unrolling
);
2115 for (orig_insn
= BB_HEAD (orig_bb
);
2116 orig_insn
!= NEXT_INSN (BB_END (bb
));
2119 next
= NEXT_INSN (orig_insn
);
2121 if (!INSN_P (orig_insn
))
2124 ivts_templ
.insn
= orig_insn
;
2125 if (opt_info
->insns_to_split
)
2127 maybe_strip_eq_note_for_split_iv (opt_info
, orig_insn
);
2129 ivts
= (struct iv_to_split
*)
2130 opt_info
->insns_to_split
->find (&ivts_templ
);
2134 insert_base_initialization (ivts
, orig_insn
);
2135 split_iv (ivts
, orig_insn
, delta
);
2144 /* Release OPT_INFO. */
2147 free_opt_info (struct opt_info
*opt_info
)
2149 delete opt_info
->insns_to_split
;
2150 opt_info
->insns_to_split
= NULL
;
2151 if (opt_info
->insns_with_var_to_expand
)
2153 struct var_to_expand
*ves
;
2155 for (ves
= opt_info
->var_to_expand_head
; ves
; ves
= ves
->next
)
2156 ves
->var_expansions
.release ();
2157 delete opt_info
->insns_with_var_to_expand
;
2158 opt_info
->insns_with_var_to_expand
= NULL
;