/* Loop unrolling.
   Copyright (C) 2002-2015 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "double-int.h"
#include "hard-reg-set.h"
#include "dominance.h"
#include "basic-block.h"
#include "insn-codes.h"
#include "hash-table.h"
/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from greater sequentiality
   of code, better opportunities for further optimization passes, and in
   some cases from fewer tests of exit conditions.  The main problem is
   code growth, which impacts performance negatively due to cache effects.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of the exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of computing the number of iterations.
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code.

   For more detailed descriptions of each of those, see comments at
   the appropriate functions below.

   There are a number of parameters (defined and described in params.def)
   that control how much we unroll.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of
   several %.  */
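
/* For illustration (a made-up loop, not taken from this file): unrolling

     for (i = 0; i < 100; i++)
       a[i] = b[i] + c[i];

   four times yields

     for (i = 0; i < 100; i += 4)
       {
         a[i + 0] = b[i + 0] + c[i + 0];
         a[i + 1] = b[i + 1] + c[i + 1];
         a[i + 2] = b[i + 2] + c[i + 2];
         a[i + 3] = b[i + 3] + c[i + 3];
       }

   which executes the exit test 25 times instead of 100 and gives the
   scheduler a longer straight-line body to work with, at the cost of
   roughly quadrupling the code size of the loop.  */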

/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;	/* The insn in which the induction variable occurs.  */
  rtx orig_var;		/* The variable (register) for the IV before split.  */
  rtx base_var;		/* The variable on which the values in the further
			   iterations are based.  */
  rtx step;		/* Step of the induction variable.  */
  struct iv_to_split *next;	/* Next entry in walking order.  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;	/* The insn in which the variable expansion occurs.  */
  rtx reg;		/* The accumulator which is expanded.  */
  vec<rtx> var_expansions;	/* The copies of the accumulator which is
				   expanded.  */
  struct var_to_expand *next;	/* Next entry in walking order.  */
  enum rtx_code op;	/* The type of the accumulation - addition, subtraction
			   or multiplication.  */
  int expansion_count;	/* Count the number of expansions generated so far.  */
  int reuse_expansion;	/* The expansion we intend to reuse to expand
			   the accumulator.  If REUSE_EXPANSION is 0 reuse
			   the original accumulator.  Else use
			   var_expansions[REUSE_EXPANSION - 1].  */
};
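
/* A sketch of the mechanism (hypothetical register names, for illustration
   only): when the accumulator in "sum += x" is expanded across three extra
   copies of the loop body, VAR_EXPANSIONS holds the new registers sum1,
   sum2 and sum3; each copy accumulates into its own register, and the
   partial results are summed back into the original accumulator at the
   loop exit.  */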

/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : typed_free_remove <iv_to_split>
{
  typedef iv_to_split value_type;
  typedef iv_to_split compare_type;
  static inline hashval_t hash (const value_type *);
  static inline bool equal (const value_type *, const compare_type *);
};

/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const value_type *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}

/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
{
  return i1->insn == i2->insn;
}

/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : typed_free_remove <var_to_expand>
{
  typedef var_to_expand value_type;
  typedef var_to_expand compare_type;
  static inline hashval_t hash (const value_type *);
  static inline bool equal (const value_type *, const compare_type *);
};

/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const value_type *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}

/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
{
  return i1->insn == i2->insn;
}

/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split;	/* A hashtable of insns to
						   split.  */
  struct iv_to_split *iv_to_split_head;	/* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail;	/* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand;	/* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head;	/* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail;	/* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};

static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void unroll_loop_stupid (struct loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop *,
							 rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);

/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (struct loop *loop, location_t locus)
{
  int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_printf_loc (report_flags, locus,
		   "loop unrolled %d times",
		   loop->lpt_decision.times);
  if (profile_info)
    dump_printf (report_flags,
		 " (header execution count %d)",
		 (int)loop->header->count);

  dump_printf (report_flags, "\n");
}

/* Decide whether to unroll loops and how much.  */

static void
decide_unrolling (int flags)
{
  struct loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (TDF_RTL, locus,
			 ";; *** Considering loop %d at BB %d for "
			 "unrolling ***\n",
			 loop->num, loop->header->index);

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
	 priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}

/* Unroll LOOPs.  */

static void
unroll_loops (int flags)
{
  struct loop *loop;
  bool changed = false;

  /* Now decide rest of unrolling.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }
}

/* Check whether exit of the LOOP is at the end of loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn) && active_insn_p (insn))
	return false;
    }

  return true;
}

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with constant "
	     "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in the case the loop has more
     than one exit it may well loop less than the determined maximal number
     of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}
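
/* A worked example of the heuristic above (illustrative numbers only):
   with desc->niter == 100 and an exit that is not at the end of the body,
   trying i == 4 gives exit_mod == 100 % 5 == 0 and n_copies == 0 + 4 + 1
   == 5, while i == 3 (an unroll factor of 4) gives exit_mod == 100 % 4
   == 0 and n_copies == 0 + 3 + 1 == 4; the smaller copy count wins, so
   best_unroll ends up as 3.  */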

/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==> (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   for (; i < 102; i += 4)
     {
       body;
       body;
       body;
       body;
     }
   */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test
	 of the exit condition have a continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as above,
	 if the loop tests the condition at the end of the loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care
	 about the case when we would exit before reaching the loop.  So
	 just peel exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP iterating a runtime computable number of
   times and how much.  */

static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with runtime "
	     "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in computation of number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}
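
/* For example (an illustrative value): with nunroll == 6, the loop above
   terminates with i == 4, so the unroll factor is 4 and
   lpt_decision.times == 3.  The power-of-two restriction is what later
   allows "niter % factor" to be computed as "niter & (factor - 1)" with
   no concern for overflow.  */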

/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
		      rtx_insn *cinsn)
{
  rtx_insn *seq, *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way how to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = get_last_insn ();
      gcc_assert (JUMP_P (jump));
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL_RTX, label, -1);
      jump = get_last_insn ();
      gcc_assert (JUMP_P (jump));
      JUMP_LABEL (jump) = label;
      LABEL_NUSES (label)++;
    }
  add_int_reg_note (jump, REG_BR_PROB, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}

/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==> (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
	 body; i++;
       case 2:
	 body; i++;
       case 1:
	 body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;
      unsigned j;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
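
  /* For example (illustrative numbers): with max_unroll == 3 the factor is
     4, and for any unsigned NITER we have niter % 4 == niter & 3, so the
     AND above yields the correct modulo even when the expression computing
     NITER wrapped around; a DIV/MOD expansion would not be safe here.  */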

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of loop body (almost always we must leave exit test
     here; the only exception is when we have extra zero check and the number
     of iterations is reliable.  Also record the place of (possible) extra
     zero check.  */
  bitmap_clear (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    bitmap_set_bit (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
				      1, wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
					  block_label (preheader), p,
					  NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */
  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP stupidly and how much.  */

static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}

/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==> (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
	 for a loop to be really simple.  We could update the counts, but the
	 problem is that we are unable to decide which exit will be taken
	 (not really true in case the number of iterations is constant,
	 but no one will do anything with this information, so we do not
	 worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}

/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   register.  */

static bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  (*debug_uses)++;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}

/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i];
   ....
   i = i+1;
   sum1 += a[i];
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of an
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we can not
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using -- which includes the
	 initialization of the expansions with zero and the summation of
	 the expansions at the end of the computation -- will yield wrong
	 results for (x = something - x); thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;

  return ves;
}

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return
   a pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (!ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}

/* Determines which of insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}

/* Called just before loop duplication.  Records start of duplicated area
   to OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
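
/* For example (illustrative numbers): when unrolling with n_copies == 4,
   copy 0 is the original body, where the initialization is placed, so
   copy 2 executes two iterations after it and gets delta 2.  When peeling,
   copy 1 runs first and gets delta 0, while the original body (copy 0)
   runs last and gets delta n_copies.  */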

/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}

/* Replace the use of induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
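
/* For instance (an illustrative sketch): with base variable i0, step 1 and
   delta 2, an increment "i = i + 1" in the third copy of the body becomes
   "i = i0 + 2", so the copies no longer form one long dependence chain and
   can be scheduled independently.  */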

/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
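
/* In effect the expansions are handed out round-robin: with two recorded
   expansions sum1 and sum2 (hypothetical names), successive calls return
   sum, sum1, sum2, sum, sum1, ... so body copies beyond the expansion
   limit still spread their updates evenly over the available registers.  */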

/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}

/* Initialize the variable expansions in loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor sign zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something;
   ....
   i = i+1;
   sum1 += something;
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero; the
   final result of sum should be -zero thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */
static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
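
/* E.g. with expansions sum1 and sum2 (hypothetical names), the sequence
   emitted above computes sum = sum2 + (sum1 + sum) for additive ops; for
   MULT the partial results are combined with multiplications instead,
   which is why the expansions were initialized with 1 rather than 0.  */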

/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (!note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}

/* Apply loop optimizations in loop copies using the
   data which was gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_INSN_P (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_INSN_P (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find them as originals of the blocks
     in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}