/* Loop unrolling.
   Copyright (C) 2002-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest there, and we want to avoid
   unnecessary code size growth.  The gain comes from greater
   sequentiality of the code, better opportunities for subsequent
   passes to optimize, and in some cases from fewer tests of exit
   conditions.  The main problem is code growth, which hurts
   performance through its effect on the caches.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of the exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations.
   -- simple unrolling of the remaining loops; this is performed only if
      we are asked to, as the gain is questionable in this case and often
      it may even slow down the code.

   For more detailed descriptions of each of those, see the comments at
   the appropriate functions below.

   There are a lot of parameters (defined and described in params.def)
   that control how much we unroll.

   ??? A great problem is that we don't have a good way to determine how
   many times we should unroll a loop; the experiments I have made
   showed that this choice may affect performance on the order of
   several %.  */
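/* For quick orientation (an illustrative sketch only; the precise
   transformations are documented in the comments before each
   unroll_loop_* function below): given "for (i = 0; i < n; i++) body;"
   and an unroll factor of 4, the constant case peels n % 4 copies and
   keeps a single exit test per four bodies; the runtime case emits a
   switch on n % 4 in front of the 4x body; the "stupid" case simply
   copies the body four times and keeps all exit tests.  */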
/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;	/* The insn in which the induction variable occurs.  */
  rtx orig_var;		/* The variable (register) for the IV before the split.  */
  rtx base_var;		/* The variable on which the values in further
			   iterations are based.  */
  rtx step;		/* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
};
/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;	   /* The insn in which the variable expansion occurs.  */
  rtx reg;		   /* The accumulator which is expanded.  */
  vec<rtx> var_expansions; /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next; /* Next entry in walking order.  */
  enum rtx_code op;	   /* The type of the accumulation - addition,
			      subtraction or multiplication.  */
  int expansion_count;	   /* Count the number of expansions generated so far.  */
  int reuse_expansion;	   /* The expansion we intend to reuse to expand
			      the accumulator.  If REUSE_EXPANSION is 0 reuse
			      the original accumulator.  Else use
			      var_expansions[REUSE_EXPANSION - 1].  */
};
/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};
/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const iv_to_split *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}
/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
{
  return i1->insn == i2->insn;
}
/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};
/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const var_to_expand *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}
/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
{
  return i1->insn == i2->insn;
}
/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};
static void decide_unroll_stupid (class loop *, int);
static void decide_unroll_constant_iterations (class loop *, int);
static void decide_unroll_runtime_iterations (class loop *, int);
static void unroll_loop_stupid (class loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (class loop *);
static void unroll_loop_runtime_iterations (class loop *);
static struct opt_info *analyze_insns_in_loop (class loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (class loop *,
							 rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (class loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);
/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (class loop *loop, dump_location_t locus)
{
  dump_flags_t report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_metadata_t metadata (report_flags, locus.get_impl_location ());
  dump_printf_loc (metadata, locus.get_user_location (),
		   "loop unrolled %d times",
		   loop->lpt_decision.times);
  if (profile_info && loop->header->count.initialized_p ())
    dump_printf (metadata,
		 " (header execution count %d)",
		 (int)loop->header->count.to_gcov_type ());

  dump_printf (metadata, "\n");
}
/* Decide whether to unroll loops and how much.  */

static void
decide_unrolling (int flags)
{
  /* Scan the loops, inner ones first.  */
  for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
    {
      loop->lpt_decision.decision = LPT_NONE;
      dump_user_location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, locus,
			 "considering unrolling loop %d at BB %d\n",
			 loop->num, loop->header->index);

      if (loop->unroll == 1)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not unrolling loop, user didn't want it unrolled\n");
	  continue;
	}

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}
/* Unroll LOOPS.  */

void
unroll_loops (int flags)
{
  bool changed = false;

  /* Now decide rest of unrolling.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }
}
/* Check whether exit of the LOOP is at the end of loop body.  */

static bool
loop_exit_at_end_p (class loop *loop)
{
  class niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    if (INSN_P (insn) && active_insn_p (insn))
      return false;

  return true;
}
/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with constant "
		 "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check for an explicit unrolling factor.  */
  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    {
      /* However we cannot unroll completely at the RTL level a loop with
	 constant number of iterations; it should have been peeled instead.  */
      if (desc->niter == 0 || (unsigned) loop->unroll > desc->niter - 1)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Loop should have been peeled\n");
	}
      else
	{
	  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
	  loop->lpt_decision.times = loop->unroll - 1;
	}
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in case the loop has more
     than one exit it may well loop fewer times than the determined
     maximal number of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_likely_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i > desc->niter - 2)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}
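/* A worked example of the search above, with made-up numbers: suppose
   desc->niter == 100 and nunroll == 8, so i runs from 18 down to 7 and
   best_copies starts at 2 * 8 + 10 == 26.  For a loop whose exit is not
   at the end, n_copies == exit_mod + i + 1; i == 9 (factor 10) gives
   100 % 10 + 9 + 1 == 10 copies, while i == 7 (factor 8) gives
   100 % 8 + 7 + 1 == 12, so best_unroll ends up 9.  The search thus
   prefers factors that divide the iteration count, since leftover
   iterations must be peeled as extra copies.  */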
/* Unroll LOOP with a constant number of iterations LOOP->LPT_DECISION.TIMES
   times.  The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
  */

static void
unroll_loop_constant_iterations (class loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  auto_sbitmap wont_exit (max_unroll + 2);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test
	 of the exit condition have a contiguous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod;
	  else
	    loop->any_likely_upper_bound = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as above
	 if the loop tests the condition at the end of the loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care about
	 the case when we would exit before reaching the loop.  So just peel
	 exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
	  else
	    loop->any_likely_upper_bound = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound,
			max_unroll + 1);
  desc->niter_expr = gen_int_mode (desc->niter, desc->mode);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
/* Decide whether to unroll LOOP iterating a runtime computable number of
   times and how much.  */

static void
decide_unroll_runtime_iterations (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with runtime-"
		 "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted at runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as the code generation
     requires it; we are unable to cope with overflows in the computation
     of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}
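/* For example (illustrative values): if the size limits give nunroll == 6,
   the loop above leaves i == 4 (1 -> 2 -> 4, stopping since 2 * 4 == 8 > 6),
   so lpt_decision.times == 3 and the body is replicated four times.  */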
/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

      * If do_compare_rtx_and_jump needs several branches to do comparison
	in a mode that needs comparison by parts, we cannot analyze the
	number of iterations of the loop, and we never get to unrolling it.

      * The code in expand_divmod that was suspected to cause creation of
	branching code seems to be only accessed for signed division.  The
	divisions used by the # of iterations analysis are always unsigned.
	Problems might arise on architectures that emit branching code
	for some operations that may appear in the unroller (especially
	for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}
/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, profile_probability prob,
		      rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label,
			       profile_probability::uninitialized ());
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  if (prob.initialized_p ())
    add_reg_br_prob_note (jump, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}
/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES times.  The times value must be a
   power of two.  The transformation does this (with some extra care
   for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
	 body; i++;
       case 2:
	 body; i++;
       case 1:
	 body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (class loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i;
  profile_probability p;
  basic_block preheader, *body, swtch, ezc_swtch = NULL;
  int may_exit_copy;
  profile_count iter_count, new_count;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    for (basic_block bb : get_dominated_by (CDI_DOMINATORS, body[i]))
      if (!flow_bb_inside_loop_p (loop, bb))
	dom_bbs.safe_push (bb);
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }
  /* Get expression for number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* For loops that exit at end and whose number of iterations is reliable,
     add one to niter to account for first pass through loop body before
     reaching exit test.  */
  if (exit_at_end && !desc->noloop_assumptions)
    {
      niter = expand_simple_binop (desc->mode, PLUS,
				   niter, const1_rtx,
				   NULL_RTX, 0, OPTAB_LIB_WIDEN);
      old_niter = niter;
    }

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);
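  /* To illustrate the masking above (made-up values): with
     LOOP->LPT_DECISION.TIMES == 3 the unroll factor is 4, so
     "niter & 3" equals "niter % 4".  The identity
     n % 2^k == n & (2^k - 1) holds for unsigned values even when the
     earlier "niter + 1" adjustment wrapped around, which is why a
     power of two was forced in decide_unroll_runtime_iterations.  */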
  auto_vec<edge> remove_edges;

  auto_sbitmap wont_exit (max_unroll + 2);

  if (extra_zero_check || desc->noloop_assumptions)
    {
      /* Peel the first copy of loop body.  Leave the exit test if the number
	 of iterations is not reliable.  Also record the place of the extra
	 zero check.  */
      bitmap_clear (wont_exit);
      if (!desc->noloop_assumptions)
	bitmap_set_bit (wont_exit, 1);
      ezc_swtch = loop_preheader_edge (loop)->src;
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);
    }

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  /* Compute count increments for each switch block and initialize
     innermost switch block.  Switch blocks and peeled loop copies are built
     from innermost outward.  */
  iter_count = new_count = swtch->count.apply_scale (1, max_unroll + 1);
  swtch->count = new_count;

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
      unsigned j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = profile_probability::always ().apply_scale (1, i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      /* Add in count of edge from switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter),
					  gen_int_mode (j, desc->mode), EQ,
					  block_label (preheader), p, NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      new_count += iter_count;
      swtch->count = new_count;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }
  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = profile_probability::always ().apply_scale (1, max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      /* Recompute count adjustments since initial peel copy may
	 have exited and reduced those values that were computed above.  */
      iter_count = swtch->count.apply_scale (1, max_unroll + 1);
      /* Add in count of edge from switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
  /* And unroll loop.  */
  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound,
			max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
      if (loop->any_likely_upper_bound
	  && loop->nb_iterations_likely_upper_bound != 0)
	--loop->nb_iterations_likely_upper_bound;
      else
	loop->any_likely_upper_bound = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
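/* Tracing the whole transformation with made-up values: for a loop
   "for (i = 0; i < n; i++)" with n == 10 and TIMES == 3, init_code
   computes mod = 10 & 3 == 2, the preconditioning switch executes two
   peeled copies, and the 4x-unrolled body then runs 10 / 4 == 2 times,
   matching the new niter_expr old_niter / (max_unroll + 1) above.  */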
/* Decide whether to unroll LOOP stupidly and how much.  */

static void
decide_unroll_stupid (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL_ALL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE, "considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av
    = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}
/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */

static void
unroll_loop_stupid (class loop *loop)
{
  unsigned nunroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  auto_sbitmap wont_exit (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  auto_vec<edge> remove_edges;
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial
	 assumptions for a loop to be really simple.  We could update the
	 counts, but the problem is that we are unable to decide which exit
	 will be taken (not really true in case the number of iterations is
	 constant, but no one will do anything with this information, so we
	 do not worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}
/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

static bool
referenced_in_one_insn_in_loop_p (class loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  (*debug_uses)++;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}
/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (class loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}
/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (class loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using -- initializing the
	 expansions with zero and summing them at the end of the
	 computation -- will yield wrong results for (x = something - x),
	 thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}
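/* As a concrete case of the FMA restriction above (a sketch, not taken
   from any particular target): an insn of the form
   (set (reg:DF sum) (fma:DF (reg a) (reg b) (reg:DF sum))), i.e.
   sum = a * b + sum in C, qualifies because the accumulator sits in the
   ADD position XEXP (src, 2); sum = sum * b + c does not.  */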
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and
   return a pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  class rtx_iv iv;
  struct iv_to_split *ivts;
  scalar_int_mode mode;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest) || !is_a <scalar_int_mode> (GET_MODE (dest), &mode))
    return NULL;

  if (!biv_p (insn, mode, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (!ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
/* Determines which of the insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (class loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  auto_vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves,
								     INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  free (body);
  return opt_info;
}
/* Called just before loop duplication.  Records the start of the duplicated
   area in OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}
/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
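/* Example of the numbering, assuming n_copies == 3: when unrolling,
   copies 0 (the original body, where the base variable is initialized)
   through 3 get deltas 0..3.  When peeling, the initialization lands in
   copy 1, so copies 1..3 get deltas 0..2 and the original body (number
   0), executed last, gets delta n_copies == 3.  */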
/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}
/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}
/* Replace the use of the induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  copy_rtx (ivts->step),
				  gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
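/* For instance, with expansion_count == 2 successive calls yield
   ve->reg, var_expansions[0], var_expansions[1], ve->reg, ... :
   reuse_expansion steps 0 -> 1 -> 2 and wraps back to 0 once it
   reaches var_expansions.length ().  */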
/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (param_max_variable_expansions > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}
/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */
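/* The underlying IEEE 754 detail: in round-to-nearest,
   (-0.0) + (+0.0) evaluates to +0.0, while (-0.0) + (-0.0) is -0.0.
   Partial sums initialized with +0.0 could therefore turn a -0.0
   result into +0.0 when they are added back in at the loop exit.  */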
static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode),
					    mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  /* ve->reg might be SUBREG or some other non-shareable RTL, and we use
     it both here and as the destination of the assignment.  */
  sum = copy_rtx (sum);
  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
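/* E.g. (a sketch) for two expansions of an addition accumulator, the
   computation emitted at the exit is sum = sum2 + (sum1 + sum), built
   innermost-first by the loop above and then forced into ve->reg.  */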
/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (!note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}
/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body
   of the loop (as it should happen in complete unrolling, but not in
   ordinary peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_BIND_INSN_P (insn)
		  && INSN_VAR_LOCATION_DECL (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_BIND_INSN_P (orig_insn)
		     && INSN_VAR_LOCATION_DECL (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find the blocks as originals of
     the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}
/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }

  free (opt_info);
}