2014-10-24 Christophe Lyon <christophe.lyon@linaro.org>
[official-gcc.git] / gcc / loop-unroll.c
blobd68b8467b3df68a1ecf1ee35272db68f29802682
1 /* Loop unrolling.
2 Copyright (C) 2002-2014 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "hard-reg-set.h"
27 #include "obstack.h"
28 #include "profile.h"
29 #include "basic-block.h"
30 #include "cfgloop.h"
31 #include "params.h"
32 #include "expr.h"
33 #include "hash-table.h"
34 #include "recog.h"
35 #include "target.h"
36 #include "dumpfile.h"
/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain is caused by greater sequentiality
   of code, better code to optimize for further passes and in some cases
   by fewer testings of exit conditions.  The main problem is code growth,
   that impacts performance negatively due to effect of caches.

   What we do:

   -- unrolling of loops that roll constant times; this is almost always
      a win, as we get rid of exit condition tests.
   -- unrolling of loops that roll number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense for calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see comments at
   appropriate function below.

   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A great problem is that we don't have a good way how to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance in order of several %.  */
67 /* Information about induction variables to split. */
69 struct iv_to_split
71 rtx_insn *insn; /* The insn in that the induction variable occurs. */
72 rtx orig_var; /* The variable (register) for the IV before split. */
73 rtx base_var; /* The variable on that the values in the further
74 iterations are based. */
75 rtx step; /* Step of the induction variable. */
76 struct iv_to_split *next; /* Next entry in walking order. */
79 /* Information about accumulators to expand. */
81 struct var_to_expand
83 rtx_insn *insn; /* The insn in that the variable expansion occurs. */
84 rtx reg; /* The accumulator which is expanded. */
85 vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
86 struct var_to_expand *next; /* Next entry in walking order. */
87 enum rtx_code op; /* The type of the accumulation - addition, subtraction
88 or multiplication. */
89 int expansion_count; /* Count the number of expansions generated so far. */
90 int reuse_expansion; /* The expansion we intend to reuse to expand
91 the accumulator. If REUSE_EXPANSION is 0 reuse
92 the original accumulator. Else use
93 var_expansions[REUSE_EXPANSION - 1]. */
96 /* Hashtable helper for iv_to_split. */
98 struct iv_split_hasher : typed_free_remove <iv_to_split>
100 typedef iv_to_split value_type;
101 typedef iv_to_split compare_type;
102 static inline hashval_t hash (const value_type *);
103 static inline bool equal (const value_type *, const compare_type *);
107 /* A hash function for information about insns to split. */
109 inline hashval_t
110 iv_split_hasher::hash (const value_type *ivts)
112 return (hashval_t) INSN_UID (ivts->insn);
115 /* An equality functions for information about insns to split. */
117 inline bool
118 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
120 return i1->insn == i2->insn;
123 /* Hashtable helper for iv_to_split. */
125 struct var_expand_hasher : typed_free_remove <var_to_expand>
127 typedef var_to_expand value_type;
128 typedef var_to_expand compare_type;
129 static inline hashval_t hash (const value_type *);
130 static inline bool equal (const value_type *, const compare_type *);
133 /* Return a hash for VES. */
135 inline hashval_t
136 var_expand_hasher::hash (const value_type *ves)
138 return (hashval_t) INSN_UID (ves->insn);
141 /* Return true if I1 and I2 refer to the same instruction. */
143 inline bool
144 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
146 return i1->insn == i2->insn;
149 /* Information about optimization applied in
150 the unrolled loop. */
152 struct opt_info
154 hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
155 split. */
156 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
157 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
158 hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
159 insns with accumulators to expand. */
160 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
161 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
162 unsigned first_new_block; /* The first basic block that was
163 duplicated. */
164 basic_block loop_exit; /* The loop exit basic block. */
165 basic_block loop_preheader; /* The loop preheader basic block. */
168 static void decide_unroll_stupid (struct loop *, int);
169 static void decide_unroll_constant_iterations (struct loop *, int);
170 static void decide_unroll_runtime_iterations (struct loop *, int);
171 static void unroll_loop_stupid (struct loop *);
172 static void decide_unrolling (int);
173 static void unroll_loop_constant_iterations (struct loop *);
174 static void unroll_loop_runtime_iterations (struct loop *);
175 static struct opt_info *analyze_insns_in_loop (struct loop *);
176 static void opt_info_start_duplication (struct opt_info *);
177 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
178 static void free_opt_info (struct opt_info *);
179 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
180 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
181 static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
182 static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
183 static void insert_var_expansion_initialization (struct var_to_expand *,
184 basic_block);
185 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
186 basic_block);
187 static rtx get_expansion (struct var_to_expand *);
189 /* Emit a message summarizing the unroll that will be
190 performed for LOOP, along with the loop's location LOCUS, if
191 appropriate given the dump or -fopt-info settings. */
193 static void
194 report_unroll (struct loop *loop, location_t locus)
196 int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
198 if (loop->lpt_decision.decision == LPT_NONE)
199 return;
201 if (!dump_enabled_p ())
202 return;
204 dump_printf_loc (report_flags, locus,
205 "loop unrolled %d times",
206 loop->lpt_decision.times);
207 if (profile_info)
208 dump_printf (report_flags,
209 " (header execution count %d)",
210 (int)loop->header->count);
212 dump_printf (report_flags, "\n");
215 /* Decide whether unroll loops and how much. */
216 static void
217 decide_unrolling (int flags)
219 struct loop *loop;
221 /* Scan the loops, inner ones first. */
222 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
224 loop->lpt_decision.decision = LPT_NONE;
225 location_t locus = get_loop_location (loop);
227 if (dump_enabled_p ())
228 dump_printf_loc (TDF_RTL, locus,
229 ";; *** Considering loop %d at BB %d for "
230 "unrolling ***\n",
231 loop->num, loop->header->index);
233 /* Do not peel cold areas. */
234 if (optimize_loop_for_size_p (loop))
236 if (dump_file)
237 fprintf (dump_file, ";; Not considering loop, cold area\n");
238 continue;
241 /* Can the loop be manipulated? */
242 if (!can_duplicate_loop_p (loop))
244 if (dump_file)
245 fprintf (dump_file,
246 ";; Not considering loop, cannot duplicate\n");
247 continue;
250 /* Skip non-innermost loops. */
251 if (loop->inner)
253 if (dump_file)
254 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
255 continue;
258 loop->ninsns = num_loop_insns (loop);
259 loop->av_ninsns = average_num_loop_insns (loop);
261 /* Try transformations one by one in decreasing order of
262 priority. */
264 decide_unroll_constant_iterations (loop, flags);
265 if (loop->lpt_decision.decision == LPT_NONE)
266 decide_unroll_runtime_iterations (loop, flags);
267 if (loop->lpt_decision.decision == LPT_NONE)
268 decide_unroll_stupid (loop, flags);
270 report_unroll (loop, locus);
274 /* Unroll LOOPS. */
275 void
276 unroll_loops (int flags)
278 struct loop *loop;
279 bool changed = false;
281 /* Now decide rest of unrolling. */
282 decide_unrolling (flags);
284 /* Scan the loops, inner ones first. */
285 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
287 /* And perform the appropriate transformations. */
288 switch (loop->lpt_decision.decision)
290 case LPT_UNROLL_CONSTANT:
291 unroll_loop_constant_iterations (loop);
292 changed = true;
293 break;
294 case LPT_UNROLL_RUNTIME:
295 unroll_loop_runtime_iterations (loop);
296 changed = true;
297 break;
298 case LPT_UNROLL_STUPID:
299 unroll_loop_stupid (loop);
300 changed = true;
301 break;
302 case LPT_NONE:
303 break;
304 default:
305 gcc_unreachable ();
309 if (changed)
311 calculate_dominance_info (CDI_DOMINATORS);
312 fix_loop_structure (NULL);
315 iv_analysis_done ();
318 /* Check whether exit of the LOOP is at the end of loop body. */
320 static bool
321 loop_exit_at_end_p (struct loop *loop)
323 struct niter_desc *desc = get_simple_loop_desc (loop);
324 rtx_insn *insn;
326 /* We should never have conditional in latch block. */
327 gcc_assert (desc->in_edge->dest != loop->header);
329 if (desc->in_edge->dest != loop->latch)
330 return false;
332 /* Check that the latch is empty. */
333 FOR_BB_INSNS (loop->latch, insn)
335 if (INSN_P (insn) && active_insn_p (insn))
336 return false;
339 return true;
342 /* Decide whether to unroll LOOP iterating constant number of times
343 and how much. */
345 static void
346 decide_unroll_constant_iterations (struct loop *loop, int flags)
348 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
349 struct niter_desc *desc;
350 widest_int iterations;
352 if (!(flags & UAP_UNROLL))
354 /* We were not asked to, just return back silently. */
355 return;
358 if (dump_file)
359 fprintf (dump_file,
360 "\n;; Considering unrolling loop with constant "
361 "number of iterations\n");
363 /* nunroll = total number of copies of the original loop body in
364 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
365 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
366 nunroll_by_av
367 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
368 if (nunroll > nunroll_by_av)
369 nunroll = nunroll_by_av;
370 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
371 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
373 if (targetm.loop_unroll_adjust)
374 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
376 /* Skip big loops. */
377 if (nunroll <= 1)
379 if (dump_file)
380 fprintf (dump_file, ";; Not considering loop, is too big\n");
381 return;
384 /* Check for simple loops. */
385 desc = get_simple_loop_desc (loop);
387 /* Check number of iterations. */
388 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
390 if (dump_file)
391 fprintf (dump_file,
392 ";; Unable to prove that the loop iterates constant times\n");
393 return;
396 /* Check whether the loop rolls enough to consider.
397 Consult also loop bounds and profile; in the case the loop has more
398 than one exit it may well loop less than determined maximal number
399 of iterations. */
400 if (desc->niter < 2 * nunroll
401 || ((get_estimated_loop_iterations (loop, &iterations)
402 || get_max_loop_iterations (loop, &iterations))
403 && wi::ltu_p (iterations, 2 * nunroll)))
405 if (dump_file)
406 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
407 return;
410 /* Success; now compute number of iterations to unroll. We alter
411 nunroll so that as few as possible copies of loop body are
412 necessary, while still not decreasing the number of unrollings
413 too much (at most by 1). */
414 best_copies = 2 * nunroll + 10;
416 i = 2 * nunroll + 2;
417 if (i - 1 >= desc->niter)
418 i = desc->niter - 2;
420 for (; i >= nunroll - 1; i--)
422 unsigned exit_mod = desc->niter % (i + 1);
424 if (!loop_exit_at_end_p (loop))
425 n_copies = exit_mod + i + 1;
426 else if (exit_mod != (unsigned) i
427 || desc->noloop_assumptions != NULL_RTX)
428 n_copies = exit_mod + i + 2;
429 else
430 n_copies = i + 1;
432 if (n_copies < best_copies)
434 best_copies = n_copies;
435 best_unroll = i;
439 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
440 loop->lpt_decision.times = best_unroll;
443 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
444 The transformation does this:
446 for (i = 0; i < 102; i++)
447 body;
449 ==> (LOOP->LPT_DECISION.TIMES == 3)
451 i = 0;
452 body; i++;
453 body; i++;
454 while (i < 102)
456 body; i++;
457 body; i++;
458 body; i++;
459 body; i++;
462 static void
463 unroll_loop_constant_iterations (struct loop *loop)
465 unsigned HOST_WIDE_INT niter;
466 unsigned exit_mod;
467 sbitmap wont_exit;
468 unsigned i;
469 edge e;
470 unsigned max_unroll = loop->lpt_decision.times;
471 struct niter_desc *desc = get_simple_loop_desc (loop);
472 bool exit_at_end = loop_exit_at_end_p (loop);
473 struct opt_info *opt_info = NULL;
474 bool ok;
476 niter = desc->niter;
478 /* Should not get here (such loop should be peeled instead). */
479 gcc_assert (niter > max_unroll + 1);
481 exit_mod = niter % (max_unroll + 1);
483 wont_exit = sbitmap_alloc (max_unroll + 1);
484 bitmap_ones (wont_exit);
486 auto_vec<edge> remove_edges;
487 if (flag_split_ivs_in_unroller
488 || flag_variable_expansion_in_unroller)
489 opt_info = analyze_insns_in_loop (loop);
491 if (!exit_at_end)
493 /* The exit is not at the end of the loop; leave exit test
494 in the first copy, so that the loops that start with test
495 of exit condition have continuous body after unrolling. */
497 if (dump_file)
498 fprintf (dump_file, ";; Condition at beginning of loop.\n");
500 /* Peel exit_mod iterations. */
501 bitmap_clear_bit (wont_exit, 0);
502 if (desc->noloop_assumptions)
503 bitmap_clear_bit (wont_exit, 1);
505 if (exit_mod)
507 opt_info_start_duplication (opt_info);
508 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
509 exit_mod,
510 wont_exit, desc->out_edge,
511 &remove_edges,
512 DLTHE_FLAG_UPDATE_FREQ
513 | (opt_info && exit_mod > 1
514 ? DLTHE_RECORD_COPY_NUMBER
515 : 0));
516 gcc_assert (ok);
518 if (opt_info && exit_mod > 1)
519 apply_opt_in_copies (opt_info, exit_mod, false, false);
521 desc->noloop_assumptions = NULL_RTX;
522 desc->niter -= exit_mod;
523 loop->nb_iterations_upper_bound -= exit_mod;
524 if (loop->any_estimate
525 && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
526 loop->nb_iterations_estimate -= exit_mod;
527 else
528 loop->any_estimate = false;
531 bitmap_set_bit (wont_exit, 1);
533 else
535 /* Leave exit test in last copy, for the same reason as above if
536 the loop tests the condition at the end of loop body. */
538 if (dump_file)
539 fprintf (dump_file, ";; Condition at end of loop.\n");
541 /* We know that niter >= max_unroll + 2; so we do not need to care of
542 case when we would exit before reaching the loop. So just peel
543 exit_mod + 1 iterations. */
544 if (exit_mod != max_unroll
545 || desc->noloop_assumptions)
547 bitmap_clear_bit (wont_exit, 0);
548 if (desc->noloop_assumptions)
549 bitmap_clear_bit (wont_exit, 1);
551 opt_info_start_duplication (opt_info);
552 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
553 exit_mod + 1,
554 wont_exit, desc->out_edge,
555 &remove_edges,
556 DLTHE_FLAG_UPDATE_FREQ
557 | (opt_info && exit_mod > 0
558 ? DLTHE_RECORD_COPY_NUMBER
559 : 0));
560 gcc_assert (ok);
562 if (opt_info && exit_mod > 0)
563 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
565 desc->niter -= exit_mod + 1;
566 loop->nb_iterations_upper_bound -= exit_mod + 1;
567 if (loop->any_estimate
568 && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
569 loop->nb_iterations_estimate -= exit_mod + 1;
570 else
571 loop->any_estimate = false;
572 desc->noloop_assumptions = NULL_RTX;
574 bitmap_set_bit (wont_exit, 0);
575 bitmap_set_bit (wont_exit, 1);
578 bitmap_clear_bit (wont_exit, max_unroll);
581 /* Now unroll the loop. */
583 opt_info_start_duplication (opt_info);
584 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
585 max_unroll,
586 wont_exit, desc->out_edge,
587 &remove_edges,
588 DLTHE_FLAG_UPDATE_FREQ
589 | (opt_info
590 ? DLTHE_RECORD_COPY_NUMBER
591 : 0));
592 gcc_assert (ok);
594 if (opt_info)
596 apply_opt_in_copies (opt_info, max_unroll, true, true);
597 free_opt_info (opt_info);
600 free (wont_exit);
602 if (exit_at_end)
604 basic_block exit_block = get_bb_copy (desc->in_edge->src);
605 /* Find a new in and out edge; they are in the last copy we have made. */
607 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
609 desc->out_edge = EDGE_SUCC (exit_block, 0);
610 desc->in_edge = EDGE_SUCC (exit_block, 1);
612 else
614 desc->out_edge = EDGE_SUCC (exit_block, 1);
615 desc->in_edge = EDGE_SUCC (exit_block, 0);
619 desc->niter /= max_unroll + 1;
620 loop->nb_iterations_upper_bound
621 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
622 if (loop->any_estimate)
623 loop->nb_iterations_estimate
624 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
625 desc->niter_expr = GEN_INT (desc->niter);
627 /* Remove the edges. */
628 FOR_EACH_VEC_ELT (remove_edges, i, e)
629 remove_path (e);
631 if (dump_file)
632 fprintf (dump_file,
633 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
634 max_unroll, num_loop_insns (loop));
637 /* Decide whether to unroll LOOP iterating runtime computable number of times
638 and how much. */
639 static void
640 decide_unroll_runtime_iterations (struct loop *loop, int flags)
642 unsigned nunroll, nunroll_by_av, i;
643 struct niter_desc *desc;
644 widest_int iterations;
646 if (!(flags & UAP_UNROLL))
648 /* We were not asked to, just return back silently. */
649 return;
652 if (dump_file)
653 fprintf (dump_file,
654 "\n;; Considering unrolling loop with runtime "
655 "computable number of iterations\n");
657 /* nunroll = total number of copies of the original loop body in
658 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
659 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
660 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
661 if (nunroll > nunroll_by_av)
662 nunroll = nunroll_by_av;
663 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
664 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
666 if (targetm.loop_unroll_adjust)
667 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
669 /* Skip big loops. */
670 if (nunroll <= 1)
672 if (dump_file)
673 fprintf (dump_file, ";; Not considering loop, is too big\n");
674 return;
677 /* Check for simple loops. */
678 desc = get_simple_loop_desc (loop);
680 /* Check simpleness. */
681 if (!desc->simple_p || desc->assumptions)
683 if (dump_file)
684 fprintf (dump_file,
685 ";; Unable to prove that the number of iterations "
686 "can be counted in runtime\n");
687 return;
690 if (desc->const_iter)
692 if (dump_file)
693 fprintf (dump_file, ";; Loop iterates constant times\n");
694 return;
697 /* Check whether the loop rolls. */
698 if ((get_estimated_loop_iterations (loop, &iterations)
699 || get_max_loop_iterations (loop, &iterations))
700 && wi::ltu_p (iterations, 2 * nunroll))
702 if (dump_file)
703 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
704 return;
707 /* Success; now force nunroll to be power of 2, as we are unable to
708 cope with overflows in computation of number of iterations. */
709 for (i = 1; 2 * i <= nunroll; i *= 2)
710 continue;
712 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
713 loop->lpt_decision.times = i - 1;
716 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
717 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
718 and NULL is returned instead. */
720 basic_block
721 split_edge_and_insert (edge e, rtx_insn *insns)
723 basic_block bb;
725 if (!insns)
726 return NULL;
727 bb = split_edge (e);
728 emit_insn_after (insns, BB_END (bb));
730 /* ??? We used to assume that INSNS can contain control flow insns, and
731 that we had to try to find sub basic blocks in BB to maintain a valid
732 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
733 and call break_superblocks when going out of cfglayout mode. But it
734 turns out that this never happens; and that if it does ever happen,
735 the verify_flow_info at the end of the RTL loop passes would fail.
737 There are two reasons why we expected we could have control flow insns
738 in INSNS. The first is when a comparison has to be done in parts, and
739 the second is when the number of iterations is computed for loops with
740 the number of iterations known at runtime. In both cases, test cases
741 to get control flow in INSNS appear to be impossible to construct:
743 * If do_compare_rtx_and_jump needs several branches to do comparison
744 in a mode that needs comparison by parts, we cannot analyze the
745 number of iterations of the loop, and we never get to unrolling it.
747 * The code in expand_divmod that was suspected to cause creation of
748 branching code seems to be only accessed for signed division. The
749 divisions used by # of iterations analysis are always unsigned.
750 Problems might arise on architectures that emits branching code
751 for some operations that may appear in the unroller (especially
752 for division), but we have no such architectures.
754 Considering all this, it was decided that we should for now assume
755 that INSNS can in theory contain control flow insns, but in practice
756 it never does. So we don't handle the theoretical case, and should
757 a real failure ever show up, we have a pretty good clue for how to
758 fix it. */
760 return bb;
763 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
764 true, with probability PROB. If CINSN is not NULL, it is the insn to copy
765 in order to create a jump. */
767 static rtx_insn *
768 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
769 rtx_insn *cinsn)
771 rtx_insn *seq, *jump;
772 rtx cond;
773 enum machine_mode mode;
775 mode = GET_MODE (op0);
776 if (mode == VOIDmode)
777 mode = GET_MODE (op1);
779 start_sequence ();
780 if (GET_MODE_CLASS (mode) == MODE_CC)
782 /* A hack -- there seems to be no easy generic way how to make a
783 conditional jump from a ccmode comparison. */
784 gcc_assert (cinsn);
785 cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
786 gcc_assert (GET_CODE (cond) == comp);
787 gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
788 gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
789 emit_jump_insn (copy_insn (PATTERN (cinsn)));
790 jump = get_last_insn ();
791 gcc_assert (JUMP_P (jump));
792 JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
793 LABEL_NUSES (JUMP_LABEL (jump))++;
794 redirect_jump (jump, label, 0);
796 else
798 gcc_assert (!cinsn);
800 op0 = force_operand (op0, NULL_RTX);
801 op1 = force_operand (op1, NULL_RTX);
802 do_compare_rtx_and_jump (op0, op1, comp, 0,
803 mode, NULL_RTX, NULL_RTX, label, -1);
804 jump = get_last_insn ();
805 gcc_assert (JUMP_P (jump));
806 JUMP_LABEL (jump) = label;
807 LABEL_NUSES (label)++;
809 add_int_reg_note (jump, REG_BR_PROB, prob);
811 seq = get_insns ();
812 end_sequence ();
814 return seq;
817 /* Unroll LOOP for which we are able to count number of iterations in runtime
818 LOOP->LPT_DECISION.TIMES times. The transformation does this (with some
819 extra care for case n < 0):
821 for (i = 0; i < n; i++)
822 body;
824 ==> (LOOP->LPT_DECISION.TIMES == 3)
826 i = 0;
827 mod = n % 4;
829 switch (mod)
831 case 3:
832 body; i++;
833 case 2:
834 body; i++;
835 case 1:
836 body; i++;
837 case 0: ;
840 while (i < n)
842 body; i++;
843 body; i++;
844 body; i++;
845 body; i++;
848 static void
849 unroll_loop_runtime_iterations (struct loop *loop)
851 rtx old_niter, niter, tmp;
852 rtx_insn *init_code, *branch_code;
853 unsigned i, j, p;
854 basic_block preheader, *body, swtch, ezc_swtch;
855 sbitmap wont_exit;
856 int may_exit_copy;
857 unsigned n_peel;
858 edge e;
859 bool extra_zero_check, last_may_exit;
860 unsigned max_unroll = loop->lpt_decision.times;
861 struct niter_desc *desc = get_simple_loop_desc (loop);
862 bool exit_at_end = loop_exit_at_end_p (loop);
863 struct opt_info *opt_info = NULL;
864 bool ok;
866 if (flag_split_ivs_in_unroller
867 || flag_variable_expansion_in_unroller)
868 opt_info = analyze_insns_in_loop (loop);
870 /* Remember blocks whose dominators will have to be updated. */
871 auto_vec<basic_block> dom_bbs;
873 body = get_loop_body (loop);
874 for (i = 0; i < loop->num_nodes; i++)
876 vec<basic_block> ldom;
877 basic_block bb;
879 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
880 FOR_EACH_VEC_ELT (ldom, j, bb)
881 if (!flow_bb_inside_loop_p (loop, bb))
882 dom_bbs.safe_push (bb);
884 ldom.release ();
886 free (body);
888 if (!exit_at_end)
890 /* Leave exit in first copy (for explanation why see comment in
891 unroll_loop_constant_iterations). */
892 may_exit_copy = 0;
893 n_peel = max_unroll - 1;
894 extra_zero_check = true;
895 last_may_exit = false;
897 else
899 /* Leave exit in last copy (for explanation why see comment in
900 unroll_loop_constant_iterations). */
901 may_exit_copy = max_unroll;
902 n_peel = max_unroll;
903 extra_zero_check = false;
904 last_may_exit = true;
907 /* Get expression for number of iterations. */
908 start_sequence ();
909 old_niter = niter = gen_reg_rtx (desc->mode);
910 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
911 if (tmp != niter)
912 emit_move_insn (niter, tmp);
914 /* Count modulo by ANDing it with max_unroll; we use the fact that
915 the number of unrollings is a power of two, and thus this is correct
916 even if there is overflow in the computation. */
917 niter = expand_simple_binop (desc->mode, AND,
918 niter, gen_int_mode (max_unroll, desc->mode),
919 NULL_RTX, 0, OPTAB_LIB_WIDEN);
921 init_code = get_insns ();
922 end_sequence ();
923 unshare_all_rtl_in_chain (init_code);
925 /* Precondition the loop. */
926 split_edge_and_insert (loop_preheader_edge (loop), init_code);
928 auto_vec<edge> remove_edges;
930 wont_exit = sbitmap_alloc (max_unroll + 2);
932 /* Peel the first copy of loop body (almost always we must leave exit test
933 here; the only exception is when we have extra zero check and the number
934 of iterations is reliable. Also record the place of (possible) extra
935 zero check. */
936 bitmap_clear (wont_exit);
937 if (extra_zero_check
938 && !desc->noloop_assumptions)
939 bitmap_set_bit (wont_exit, 1);
940 ezc_swtch = loop_preheader_edge (loop)->src;
941 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
942 1, wont_exit, desc->out_edge,
943 &remove_edges,
944 DLTHE_FLAG_UPDATE_FREQ);
945 gcc_assert (ok);
947 /* Record the place where switch will be built for preconditioning. */
948 swtch = split_edge (loop_preheader_edge (loop));
950 for (i = 0; i < n_peel; i++)
952 /* Peel the copy. */
953 bitmap_clear (wont_exit);
954 if (i != n_peel - 1 || !last_may_exit)
955 bitmap_set_bit (wont_exit, 1);
956 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
957 1, wont_exit, desc->out_edge,
958 &remove_edges,
959 DLTHE_FLAG_UPDATE_FREQ);
960 gcc_assert (ok);
962 /* Create item for switch. */
963 j = n_peel - i - (extra_zero_check ? 0 : 1);
964 p = REG_BR_PROB_BASE / (i + 2);
966 preheader = split_edge (loop_preheader_edge (loop));
967 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
968 block_label (preheader), p,
969 NULL);
971 /* We rely on the fact that the compare and jump cannot be optimized out,
972 and hence the cfg we create is correct. */
973 gcc_assert (branch_code != NULL_RTX);
975 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
976 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
977 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
978 e = make_edge (swtch, preheader,
979 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
980 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
981 e->probability = p;
984 if (extra_zero_check)
986 /* Add branch for zero iterations. */
987 p = REG_BR_PROB_BASE / (max_unroll + 1);
988 swtch = ezc_swtch;
989 preheader = split_edge (loop_preheader_edge (loop));
990 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
991 block_label (preheader), p,
992 NULL);
993 gcc_assert (branch_code != NULL_RTX);
995 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
996 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
997 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
998 e = make_edge (swtch, preheader,
999 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1000 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1001 e->probability = p;
1004 /* Recount dominators for outer blocks. */
1005 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1007 /* And unroll loop. */
1009 bitmap_ones (wont_exit);
1010 bitmap_clear_bit (wont_exit, may_exit_copy);
1011 opt_info_start_duplication (opt_info);
1013 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1014 max_unroll,
1015 wont_exit, desc->out_edge,
1016 &remove_edges,
1017 DLTHE_FLAG_UPDATE_FREQ
1018 | (opt_info
1019 ? DLTHE_RECORD_COPY_NUMBER
1020 : 0));
1021 gcc_assert (ok);
1023 if (opt_info)
1025 apply_opt_in_copies (opt_info, max_unroll, true, true);
1026 free_opt_info (opt_info);
1029 free (wont_exit);
1031 if (exit_at_end)
1033 basic_block exit_block = get_bb_copy (desc->in_edge->src);
1034 /* Find a new in and out edge; they are in the last copy we have
1035 made. */
1037 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1039 desc->out_edge = EDGE_SUCC (exit_block, 0);
1040 desc->in_edge = EDGE_SUCC (exit_block, 1);
1042 else
1044 desc->out_edge = EDGE_SUCC (exit_block, 1);
1045 desc->in_edge = EDGE_SUCC (exit_block, 0);
1049 /* Remove the edges. */
1050 FOR_EACH_VEC_ELT (remove_edges, i, e)
1051 remove_path (e);
1053 /* We must be careful when updating the number of iterations due to
1054 preconditioning and the fact that the value must be valid at entry
1055 of the loop. After passing through the above code, we see that
1056 the correct new number of iterations is this: */
1057 gcc_assert (!desc->const_iter);
1058 desc->niter_expr =
1059 simplify_gen_binary (UDIV, desc->mode, old_niter,
1060 gen_int_mode (max_unroll + 1, desc->mode));
1061 loop->nb_iterations_upper_bound
1062 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
1063 if (loop->any_estimate)
1064 loop->nb_iterations_estimate
1065 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
1066 if (exit_at_end)
1068 desc->niter_expr =
1069 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1070 desc->noloop_assumptions = NULL_RTX;
1071 --loop->nb_iterations_upper_bound;
1072 if (loop->any_estimate
1073 && loop->nb_iterations_estimate != 0)
1074 --loop->nb_iterations_estimate;
1075 else
1076 loop->any_estimate = false;
1079 if (dump_file)
1080 fprintf (dump_file,
1081 ";; Unrolled loop %d times, counting # of iterations "
1082 "in runtime, %i insns\n",
1083 max_unroll, num_loop_insns (loop));
1086 /* Decide whether to unroll LOOP stupidly and how much. */
1087 static void
1088 decide_unroll_stupid (struct loop *loop, int flags)
1090 unsigned nunroll, nunroll_by_av, i;
1091 struct niter_desc *desc;
1092 widest_int iterations;
1094 if (!(flags & UAP_UNROLL_ALL))
1096 /* We were not asked to, just return back silently. */
1097 return;
1100 if (dump_file)
1101 fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1103 /* nunroll = total number of copies of the original loop body in
1104 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1105 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1106 nunroll_by_av
1107 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1108 if (nunroll > nunroll_by_av)
1109 nunroll = nunroll_by_av;
1110 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1111 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1113 if (targetm.loop_unroll_adjust)
1114 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1116 /* Skip big loops. */
1117 if (nunroll <= 1)
1119 if (dump_file)
1120 fprintf (dump_file, ";; Not considering loop, is too big\n");
1121 return;
1124 /* Check for simple loops. */
1125 desc = get_simple_loop_desc (loop);
1127 /* Check simpleness. */
1128 if (desc->simple_p && !desc->assumptions)
1130 if (dump_file)
1131 fprintf (dump_file, ";; The loop is simple\n");
1132 return;
1135 /* Do not unroll loops with branches inside -- it increases number
1136 of mispredicts.
1137 TODO: this heuristic needs tunning; call inside the loop body
1138 is also relatively good reason to not unroll. */
1139 if (num_loop_branches (loop) > 1)
1141 if (dump_file)
1142 fprintf (dump_file, ";; Not unrolling, contains branches\n");
1143 return;
1146 /* Check whether the loop rolls. */
1147 if ((get_estimated_loop_iterations (loop, &iterations)
1148 || get_max_loop_iterations (loop, &iterations))
1149 && wi::ltu_p (iterations, 2 * nunroll))
1151 if (dump_file)
1152 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1153 return;
1156 /* Success. Now force nunroll to be power of 2, as it seems that this
1157 improves results (partially because of better alignments, partially
1158 because of some dark magic). */
1159 for (i = 1; 2 * i <= nunroll; i *= 2)
1160 continue;
1162 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1163 loop->lpt_decision.times = i - 1;
1166 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1168 while (cond)
1169 body;
1171 ==> (LOOP->LPT_DECISION.TIMES == 3)
1173 while (cond)
1175 body;
1176 if (!cond) break;
1177 body;
1178 if (!cond) break;
1179 body;
1180 if (!cond) break;
1181 body;
1184 static void
1185 unroll_loop_stupid (struct loop *loop)
1187 sbitmap wont_exit;
1188 unsigned nunroll = loop->lpt_decision.times;
1189 struct niter_desc *desc = get_simple_loop_desc (loop);
1190 struct opt_info *opt_info = NULL;
1191 bool ok;
1193 if (flag_split_ivs_in_unroller
1194 || flag_variable_expansion_in_unroller)
1195 opt_info = analyze_insns_in_loop (loop);
1198 wont_exit = sbitmap_alloc (nunroll + 1);
1199 bitmap_clear (wont_exit);
1200 opt_info_start_duplication (opt_info);
1202 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1203 nunroll, wont_exit,
1204 NULL, NULL,
1205 DLTHE_FLAG_UPDATE_FREQ
1206 | (opt_info
1207 ? DLTHE_RECORD_COPY_NUMBER
1208 : 0));
1209 gcc_assert (ok);
1211 if (opt_info)
1213 apply_opt_in_copies (opt_info, nunroll, true, true);
1214 free_opt_info (opt_info);
1217 free (wont_exit);
1219 if (desc->simple_p)
1221 /* We indeed may get here provided that there are nontrivial assumptions
1222 for a loop to be really simple. We could update the counts, but the
1223 problem is that we are unable to decide which exit will be taken
1224 (not really true in case the number of iterations is constant,
1225 but no one will do anything with this information, so we do not
1226 worry about it). */
1227 desc->simple_p = false;
1230 if (dump_file)
1231 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1232 nunroll, num_loop_insns (loop));
1235 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1236 Set *DEBUG_USES to the number of debug insns that reference the
1237 variable. */
1239 static bool
1240 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1241 int *debug_uses)
1243 basic_block *body, bb;
1244 unsigned i;
1245 int count_ref = 0;
1246 rtx_insn *insn;
1248 body = get_loop_body (loop);
1249 for (i = 0; i < loop->num_nodes; i++)
1251 bb = body[i];
1253 FOR_BB_INSNS (bb, insn)
1254 if (!rtx_referenced_p (reg, insn))
1255 continue;
1256 else if (DEBUG_INSN_P (insn))
1257 ++*debug_uses;
1258 else if (++count_ref > 1)
1259 break;
1261 free (body);
1262 return (count_ref == 1);
1265 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1267 static void
1268 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1270 basic_block *body, bb;
1271 unsigned i;
1272 rtx_insn *insn;
1274 body = get_loop_body (loop);
1275 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1277 bb = body[i];
1279 FOR_BB_INSNS (bb, insn)
1280 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1281 continue;
1282 else
1284 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1285 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1286 if (!--debug_uses)
1287 break;
1290 free (body);
1293 /* Determine whether INSN contains an accumulator
1294 which can be expanded into separate copies,
1295 one for each copy of the LOOP body.
1297 for (i = 0 ; i < n; i++)
1298 sum += a[i];
1302 sum += a[i]
1303 ....
1304 i = i+1;
1305 sum1 += a[i]
1306 ....
1307 i = i+1
1308 sum2 += a[i];
1309 ....
1311 Return NULL if INSN contains no opportunity for expansion of accumulator.
1312 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1313 information and return a pointer to it.
1316 static struct var_to_expand *
1317 analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
1319 rtx set, dest, src;
1320 struct var_to_expand *ves;
1321 unsigned accum_pos;
1322 enum rtx_code code;
1323 int debug_uses = 0;
1325 set = single_set (insn);
1326 if (!set)
1327 return NULL;
1329 dest = SET_DEST (set);
1330 src = SET_SRC (set);
1331 code = GET_CODE (src);
1333 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1334 return NULL;
1336 if (FLOAT_MODE_P (GET_MODE (dest)))
1338 if (!flag_associative_math)
1339 return NULL;
1340 /* In the case of FMA, we're also changing the rounding. */
1341 if (code == FMA && !flag_unsafe_math_optimizations)
1342 return NULL;
1345 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1346 in MD. But if there is no optab to generate the insn, we can not
1347 perform the variable expansion. This can happen if an MD provides
1348 an insn but not a named pattern to generate it, for example to avoid
1349 producing code that needs additional mode switches like for x87/mmx.
1351 So we check have_insn_for which looks for an optab for the operation
1352 in SRC. If it doesn't exist, we can't perform the expansion even
1353 though INSN is valid. */
1354 if (!have_insn_for (code, GET_MODE (src)))
1355 return NULL;
1357 if (!REG_P (dest)
1358 && !(GET_CODE (dest) == SUBREG
1359 && REG_P (SUBREG_REG (dest))))
1360 return NULL;
1362 /* Find the accumulator use within the operation. */
1363 if (code == FMA)
1365 /* We only support accumulation via FMA in the ADD position. */
1366 if (!rtx_equal_p (dest, XEXP (src, 2)))
1367 return NULL;
1368 accum_pos = 2;
1370 else if (rtx_equal_p (dest, XEXP (src, 0)))
1371 accum_pos = 0;
1372 else if (rtx_equal_p (dest, XEXP (src, 1)))
1374 /* The method of expansion that we are using; which includes the
1375 initialization of the expansions with zero and the summation of
1376 the expansions at the end of the computation will yield wrong
1377 results for (x = something - x) thus avoid using it in that case. */
1378 if (code == MINUS)
1379 return NULL;
1380 accum_pos = 1;
1382 else
1383 return NULL;
1385 /* It must not otherwise be used. */
1386 if (code == FMA)
1388 if (rtx_referenced_p (dest, XEXP (src, 0))
1389 || rtx_referenced_p (dest, XEXP (src, 1)))
1390 return NULL;
1392 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1393 return NULL;
1395 /* It must be used in exactly one insn. */
1396 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1397 return NULL;
1399 if (dump_file)
1401 fprintf (dump_file, "\n;; Expanding Accumulator ");
1402 print_rtl (dump_file, dest);
1403 fprintf (dump_file, "\n");
1406 if (debug_uses)
1407 /* Instead of resetting the debug insns, we could replace each
1408 debug use in the loop with the sum or product of all expanded
1409 accummulators. Since we'll only know of all expansions at the
1410 end, we'd have to keep track of which vars_to_expand a debug
1411 insn in the loop references, take note of each copy of the
1412 debug insn during unrolling, and when it's all done, compute
1413 the sum or product of each variable and adjust the original
1414 debug insn and each copy thereof. What a pain! */
1415 reset_debug_uses_in_loop (loop, dest, debug_uses);
1417 /* Record the accumulator to expand. */
1418 ves = XNEW (struct var_to_expand);
1419 ves->insn = insn;
1420 ves->reg = copy_rtx (dest);
1421 ves->var_expansions.create (1);
1422 ves->next = NULL;
1423 ves->op = GET_CODE (src);
1424 ves->expansion_count = 0;
1425 ves->reuse_expansion = 0;
1426 return ves;
1429 /* Determine whether there is an induction variable in INSN that
1430 we would like to split during unrolling.
1432 I.e. replace
1434 i = i + 1;
1436 i = i + 1;
1438 i = i + 1;
1441 type chains by
1443 i0 = i + 1
1445 i = i0 + 1
1447 i = i0 + 2
1450 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1451 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1452 pointer to it. */
1454 static struct iv_to_split *
1455 analyze_iv_to_split_insn (rtx_insn *insn)
1457 rtx set, dest;
1458 struct rtx_iv iv;
1459 struct iv_to_split *ivts;
1460 bool ok;
1462 /* For now we just split the basic induction variables. Later this may be
1463 extended for example by selecting also addresses of memory references. */
1464 set = single_set (insn);
1465 if (!set)
1466 return NULL;
1468 dest = SET_DEST (set);
1469 if (!REG_P (dest))
1470 return NULL;
1472 if (!biv_p (insn, dest))
1473 return NULL;
1475 ok = iv_analyze_result (insn, dest, &iv);
1477 /* This used to be an assert under the assumption that if biv_p returns
1478 true that iv_analyze_result must also return true. However, that
1479 assumption is not strictly correct as evidenced by pr25569.
1481 Returning NULL when iv_analyze_result returns false is safe and
1482 avoids the problems in pr25569 until the iv_analyze_* routines
1483 can be fixed, which is apparently hard and time consuming
1484 according to their author. */
1485 if (! ok)
1486 return NULL;
1488 if (iv.step == const0_rtx
1489 || iv.mode != iv.extend_mode)
1490 return NULL;
1492 /* Record the insn to split. */
1493 ivts = XNEW (struct iv_to_split);
1494 ivts->insn = insn;
1495 ivts->orig_var = dest;
1496 ivts->base_var = NULL_RTX;
1497 ivts->step = iv.step;
1498 ivts->next = NULL;
1500 return ivts;
1503 /* Determines which of insns in LOOP can be optimized.
1504 Return a OPT_INFO struct with the relevant hash tables filled
1505 with all insns to be optimized. The FIRST_NEW_BLOCK field
1506 is undefined for the return value. */
1508 static struct opt_info *
1509 analyze_insns_in_loop (struct loop *loop)
1511 basic_block *body, bb;
1512 unsigned i;
1513 struct opt_info *opt_info = XCNEW (struct opt_info);
1514 rtx_insn *insn;
1515 struct iv_to_split *ivts = NULL;
1516 struct var_to_expand *ves = NULL;
1517 iv_to_split **slot1;
1518 var_to_expand **slot2;
1519 vec<edge> edges = get_loop_exit_edges (loop);
1520 edge exit;
1521 bool can_apply = false;
1523 iv_analysis_loop_init (loop);
1525 body = get_loop_body (loop);
1527 if (flag_split_ivs_in_unroller)
1529 opt_info->insns_to_split
1530 = new hash_table<iv_split_hasher> (5 * loop->num_nodes);
1531 opt_info->iv_to_split_head = NULL;
1532 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1535 /* Record the loop exit bb and loop preheader before the unrolling. */
1536 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1538 if (edges.length () == 1)
1540 exit = edges[0];
1541 if (!(exit->flags & EDGE_COMPLEX))
1543 opt_info->loop_exit = split_edge (exit);
1544 can_apply = true;
1548 if (flag_variable_expansion_in_unroller
1549 && can_apply)
1551 opt_info->insns_with_var_to_expand
1552 = new hash_table<var_expand_hasher> (5 * loop->num_nodes);
1553 opt_info->var_to_expand_head = NULL;
1554 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1557 for (i = 0; i < loop->num_nodes; i++)
1559 bb = body[i];
1560 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1561 continue;
1563 FOR_BB_INSNS (bb, insn)
1565 if (!INSN_P (insn))
1566 continue;
1568 if (opt_info->insns_to_split)
1569 ivts = analyze_iv_to_split_insn (insn);
1571 if (ivts)
1573 slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
1574 gcc_assert (*slot1 == NULL);
1575 *slot1 = ivts;
1576 *opt_info->iv_to_split_tail = ivts;
1577 opt_info->iv_to_split_tail = &ivts->next;
1578 continue;
1581 if (opt_info->insns_with_var_to_expand)
1582 ves = analyze_insn_to_expand_var (loop, insn);
1584 if (ves)
1586 slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
1587 gcc_assert (*slot2 == NULL);
1588 *slot2 = ves;
1589 *opt_info->var_to_expand_tail = ves;
1590 opt_info->var_to_expand_tail = &ves->next;
1595 edges.release ();
1596 free (body);
1597 return opt_info;
1600 /* Called just before loop duplication. Records start of duplicated area
1601 to OPT_INFO. */
1603 static void
1604 opt_info_start_duplication (struct opt_info *opt_info)
1606 if (opt_info)
1607 opt_info->first_new_block = last_basic_block_for_fn (cfun);
/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */
static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, initialization is done in the original loop body,
     which is copy number 0, so the delta is simply the copy number.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the copy in which the initialization occurs has
     number 1, and the original loop (number 0) comes last.  */
  return n_copy ? n_copy - 1 : n_copies;
}
1635 /* Allocate basic variable for the induction variable chain. */
1637 static void
1638 allocate_basic_variable (struct iv_to_split *ivts)
1640 rtx expr = SET_SRC (single_set (ivts->insn));
1642 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1645 /* Insert initialization of basic variable of IVTS before INSN, taking
1646 the initial value from INSN. */
1648 static void
1649 insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
1651 rtx expr = copy_rtx (SET_SRC (single_set (insn)));
1652 rtx_insn *seq;
1654 start_sequence ();
1655 expr = force_operand (expr, ivts->base_var);
1656 if (expr != ivts->base_var)
1657 emit_move_insn (ivts->base_var, expr);
1658 seq = get_insns ();
1659 end_sequence ();
1661 emit_insn_before (seq, insn);
1664 /* Replace the use of induction variable described in IVTS in INSN
1665 by base variable + DELTA * step. */
1667 static void
1668 split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
1670 rtx expr, *loc, incr, var;
1671 rtx_insn *seq;
1672 enum machine_mode mode = GET_MODE (ivts->base_var);
1673 rtx src, dest, set;
1675 /* Construct base + DELTA * step. */
1676 if (!delta)
1677 expr = ivts->base_var;
1678 else
1680 incr = simplify_gen_binary (MULT, mode,
1681 ivts->step, gen_int_mode (delta, mode));
1682 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
1683 ivts->base_var, incr);
1686 /* Figure out where to do the replacement. */
1687 loc = &SET_SRC (single_set (insn));
1689 /* If we can make the replacement right away, we're done. */
1690 if (validate_change (insn, loc, expr, 0))
1691 return;
1693 /* Otherwise, force EXPR into a register and try again. */
1694 start_sequence ();
1695 var = gen_reg_rtx (mode);
1696 expr = force_operand (expr, var);
1697 if (expr != var)
1698 emit_move_insn (var, expr);
1699 seq = get_insns ();
1700 end_sequence ();
1701 emit_insn_before (seq, insn);
1703 if (validate_change (insn, loc, var, 0))
1704 return;
1706 /* The last chance. Try recreating the assignment in insn
1707 completely from scratch. */
1708 set = single_set (insn);
1709 gcc_assert (set);
1711 start_sequence ();
1712 *loc = var;
1713 src = copy_rtx (SET_SRC (set));
1714 dest = copy_rtx (SET_DEST (set));
1715 src = force_operand (src, dest);
1716 if (src != dest)
1717 emit_move_insn (dest, src);
1718 seq = get_insns ();
1719 end_sequence ();
1721 emit_insn_before (seq, insn);
1722 delete_insn (insn);
1726 /* Return one expansion of the accumulator recorded in struct VE. */
1728 static rtx
1729 get_expansion (struct var_to_expand *ve)
1731 rtx reg;
1733 if (ve->reuse_expansion == 0)
1734 reg = ve->reg;
1735 else
1736 reg = ve->var_expansions[ve->reuse_expansion - 1];
1738 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
1739 ve->reuse_expansion = 0;
1740 else
1741 ve->reuse_expansion++;
1743 return reg;
1747 /* Given INSN replace the uses of the accumulator recorded in VE
1748 with a new register. */
1750 static void
1751 expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
1753 rtx new_reg, set;
1754 bool really_new_expansion = false;
1756 set = single_set (insn);
1757 gcc_assert (set);
1759 /* Generate a new register only if the expansion limit has not been
1760 reached. Else reuse an already existing expansion. */
1761 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
1763 really_new_expansion = true;
1764 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
1766 else
1767 new_reg = get_expansion (ve);
1769 validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
1770 if (apply_change_group ())
1771 if (really_new_expansion)
1773 ve->var_expansions.safe_push (new_reg);
1774 ve->expansion_count++;
1778 /* Initialize the variable expansions in loop preheader. PLACE is the
1779 loop-preheader basic block where the initialization of the
1780 expansions should take place. The expansions are initialized with
1781 (-0) when the operation is plus or minus to honor sign zero. This
1782 way we can prevent cases where the sign of the final result is
1783 effected by the sign of the expansion. Here is an example to
1784 demonstrate this:
1786 for (i = 0 ; i < n; i++)
1787 sum += something;
1791 sum += something
1792 ....
1793 i = i+1;
1794 sum1 += something
1795 ....
1796 i = i+1
1797 sum2 += something;
1798 ....
1800 When SUM is initialized with -zero and SOMETHING is also -zero; the
1801 final result of sum should be -zero thus the expansions sum1 and sum2
1802 should be initialized with -zero as well (otherwise we will get +zero
1803 as the final result). */
1805 static void
1806 insert_var_expansion_initialization (struct var_to_expand *ve,
1807 basic_block place)
1809 rtx_insn *seq;
1810 rtx var, zero_init;
1811 unsigned i;
1812 enum machine_mode mode = GET_MODE (ve->reg);
1813 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
1815 if (ve->var_expansions.length () == 0)
1816 return;
1818 start_sequence ();
1819 switch (ve->op)
1821 case FMA:
1822 /* Note that we only accumulate FMA via the ADD operand. */
1823 case PLUS:
1824 case MINUS:
1825 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1827 if (honor_signed_zero_p)
1828 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
1829 else
1830 zero_init = CONST0_RTX (mode);
1831 emit_move_insn (var, zero_init);
1833 break;
1835 case MULT:
1836 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1838 zero_init = CONST1_RTX (GET_MODE (var));
1839 emit_move_insn (var, zero_init);
1841 break;
1843 default:
1844 gcc_unreachable ();
1847 seq = get_insns ();
1848 end_sequence ();
1850 emit_insn_after (seq, BB_END (place));
1853 /* Combine the variable expansions at the loop exit. PLACE is the
1854 loop exit basic block where the summation of the expansions should
1855 take place. */
1857 static void
1858 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
1860 rtx sum = ve->reg;
1861 rtx expr, var;
1862 rtx_insn *seq, *insn;
1863 unsigned i;
1865 if (ve->var_expansions.length () == 0)
1866 return;
1868 start_sequence ();
1869 switch (ve->op)
1871 case FMA:
1872 /* Note that we only accumulate FMA via the ADD operand. */
1873 case PLUS:
1874 case MINUS:
1875 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1876 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
1877 break;
1879 case MULT:
1880 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1881 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
1882 break;
1884 default:
1885 gcc_unreachable ();
1888 expr = force_operand (sum, ve->reg);
1889 if (expr != ve->reg)
1890 emit_move_insn (ve->reg, expr);
1891 seq = get_insns ();
1892 end_sequence ();
1894 insn = BB_HEAD (place);
1895 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
1896 insn = NEXT_INSN (insn);
1898 emit_insn_after (seq, insn);
1901 /* Strip away REG_EQUAL notes for IVs we're splitting.
1903 Updating REG_EQUAL notes for IVs we split is tricky: We
1904 cannot tell until after unrolling, DF-rescanning, and liveness
1905 updating, whether an EQ_USE is reached by the split IV while
1906 the IV reg is still live. See PR55006.
1908 ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
1909 because RTL loop-iv requires us to defer rescanning insns and
1910 any notes attached to them. So resort to old techniques... */
1912 static void
1913 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
1915 struct iv_to_split *ivts;
1916 rtx note = find_reg_equal_equiv_note (insn);
1917 if (! note)
1918 return;
1919 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1920 if (reg_mentioned_p (ivts->orig_var, note))
1922 remove_note (insn, note);
1923 return;
1927 /* Apply loop optimizations in loop copies using the
1928 data which gathered during the unrolling. Structure
1929 OPT_INFO record that data.
1931 UNROLLING is true if we unrolled (not peeled) the loop.
1932 REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
1933 the loop (as it should happen in complete unrolling, but not in ordinary
1934 peeling of the loop). */
1936 static void
1937 apply_opt_in_copies (struct opt_info *opt_info,
1938 unsigned n_copies, bool unrolling,
1939 bool rewrite_original_loop)
1941 unsigned i, delta;
1942 basic_block bb, orig_bb;
1943 rtx_insn *insn, *orig_insn, *next;
1944 struct iv_to_split ivts_templ, *ivts;
1945 struct var_to_expand ve_templ, *ves;
1947 /* Sanity check -- we need to put initialization in the original loop
1948 body. */
1949 gcc_assert (!unrolling || rewrite_original_loop);
1951 /* Allocate the basic variables (i0). */
1952 if (opt_info->insns_to_split)
1953 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1954 allocate_basic_variable (ivts);
1956 for (i = opt_info->first_new_block;
1957 i < (unsigned) last_basic_block_for_fn (cfun);
1958 i++)
1960 bb = BASIC_BLOCK_FOR_FN (cfun, i);
1961 orig_bb = get_bb_original (bb);
1963 /* bb->aux holds position in copy sequence initialized by
1964 duplicate_loop_to_header_edge. */
1965 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
1966 unrolling);
1967 bb->aux = 0;
1968 orig_insn = BB_HEAD (orig_bb);
1969 FOR_BB_INSNS_SAFE (bb, insn, next)
1971 if (!INSN_P (insn)
1972 || (DEBUG_INSN_P (insn)
1973 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
1974 continue;
1976 while (!INSN_P (orig_insn)
1977 || (DEBUG_INSN_P (orig_insn)
1978 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
1979 == LABEL_DECL)))
1980 orig_insn = NEXT_INSN (orig_insn);
1982 ivts_templ.insn = orig_insn;
1983 ve_templ.insn = orig_insn;
1985 /* Apply splitting iv optimization. */
1986 if (opt_info->insns_to_split)
1988 maybe_strip_eq_note_for_split_iv (opt_info, insn);
1990 ivts = opt_info->insns_to_split->find (&ivts_templ);
1992 if (ivts)
1994 gcc_assert (GET_CODE (PATTERN (insn))
1995 == GET_CODE (PATTERN (orig_insn)));
1997 if (!delta)
1998 insert_base_initialization (ivts, insn);
1999 split_iv (ivts, insn, delta);
2002 /* Apply variable expansion optimization. */
2003 if (unrolling && opt_info->insns_with_var_to_expand)
2005 ves = (struct var_to_expand *)
2006 opt_info->insns_with_var_to_expand->find (&ve_templ);
2007 if (ves)
2009 gcc_assert (GET_CODE (PATTERN (insn))
2010 == GET_CODE (PATTERN (orig_insn)));
2011 expand_var_during_unrolling (ves, insn);
2014 orig_insn = NEXT_INSN (orig_insn);
2018 if (!rewrite_original_loop)
2019 return;
2021 /* Initialize the variable expansions in the loop preheader
2022 and take care of combining them at the loop exit. */
2023 if (opt_info->insns_with_var_to_expand)
2025 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2026 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2027 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2028 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2031 /* Rewrite also the original loop body. Find them as originals of the blocks
2032 in the last copied iteration, i.e. those that have
2033 get_bb_copy (get_bb_original (bb)) == bb. */
2034 for (i = opt_info->first_new_block;
2035 i < (unsigned) last_basic_block_for_fn (cfun);
2036 i++)
2038 bb = BASIC_BLOCK_FOR_FN (cfun, i);
2039 orig_bb = get_bb_original (bb);
2040 if (get_bb_copy (orig_bb) != bb)
2041 continue;
2043 delta = determine_split_iv_delta (0, n_copies, unrolling);
2044 for (orig_insn = BB_HEAD (orig_bb);
2045 orig_insn != NEXT_INSN (BB_END (bb));
2046 orig_insn = next)
2048 next = NEXT_INSN (orig_insn);
2050 if (!INSN_P (orig_insn))
2051 continue;
2053 ivts_templ.insn = orig_insn;
2054 if (opt_info->insns_to_split)
2056 maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2058 ivts = (struct iv_to_split *)
2059 opt_info->insns_to_split->find (&ivts_templ);
2060 if (ivts)
2062 if (!delta)
2063 insert_base_initialization (ivts, orig_insn);
2064 split_iv (ivts, orig_insn, delta);
2065 continue;
2073 /* Release OPT_INFO. */
2075 static void
2076 free_opt_info (struct opt_info *opt_info)
2078 delete opt_info->insns_to_split;
2079 opt_info->insns_to_split = NULL;
2080 if (opt_info->insns_with_var_to_expand)
2082 struct var_to_expand *ves;
2084 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2085 ves->var_expansions.release ();
2086 delete opt_info->insns_with_var_to_expand;
2087 opt_info->insns_with_var_to_expand = NULL;
2089 free (opt_info);