Merge trunk version 222452 into gupc branch.
[official-gcc.git] / gcc / loop-unroll.c
blobccf473d4a54a897679849803ab26cef1e67c2efc
1 /* Loop unrolling.
2 Copyright (C) 2002-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "hash-set.h"
26 #include "machmode.h"
27 #include "vec.h"
28 #include "double-int.h"
29 #include "input.h"
30 #include "alias.h"
31 #include "symtab.h"
32 #include "wide-int.h"
33 #include "inchash.h"
34 #include "tree.h"
35 #include "hard-reg-set.h"
36 #include "obstack.h"
37 #include "profile.h"
38 #include "predict.h"
39 #include "function.h"
40 #include "dominance.h"
41 #include "cfg.h"
42 #include "cfgrtl.h"
43 #include "basic-block.h"
44 #include "cfgloop.h"
45 #include "params.h"
46 #include "insn-codes.h"
47 #include "optabs.h"
48 #include "hashtab.h"
49 #include "flags.h"
50 #include "statistics.h"
51 #include "real.h"
52 #include "fixed-value.h"
53 #include "insn-config.h"
54 #include "expmed.h"
55 #include "dojump.h"
56 #include "explow.h"
57 #include "calls.h"
58 #include "emit-rtl.h"
59 #include "varasm.h"
60 #include "stmt.h"
61 #include "expr.h"
62 #include "hash-table.h"
63 #include "recog.h"
64 #include "target.h"
65 #include "dumpfile.h"
67 /* This pass performs loop unrolling. We only perform this
68 optimization on innermost loops (with single exception) because
69 the impact on performance is greatest here, and we want to avoid
70 unnecessary code size growth. The gain is caused by greater sequentiality
71 of code, better code to optimize for further passes and in some cases
72 by fewer testings of exit conditions. The main problem is code growth,
73 that impacts performance negatively due to effect of caches.
75 What we do:
77 -- unrolling of loops that roll constant times; this is almost always
78 win, as we get rid of exit condition tests.
79 -- unrolling of loops that roll number of times that we can compute
80 in runtime; we also get rid of exit condition tests here, but there
81 is the extra expense for calculating the number of iterations
82 -- simple unrolling of remaining loops; this is performed only if we
83 are asked to, as the gain is questionable in this case and often
84 it may even slow down the code
85 For more detailed descriptions of each of those, see comments at
86 appropriate function below.
88 There is a lot of parameters (defined and described in params.def) that
89 control how much we unroll.
91 ??? A great problem is that we don't have a good way how to determine
92 how many times we should unroll the loop; the experiments I have made
93 showed that this choice may affect performance in order of several %.
96 /* Information about induction variables to split. */
98 struct iv_to_split
100 rtx_insn *insn; /* The insn in that the induction variable occurs. */
101 rtx orig_var; /* The variable (register) for the IV before split. */
102 rtx base_var; /* The variable on that the values in the further
103 iterations are based. */
104 rtx step; /* Step of the induction variable. */
105 struct iv_to_split *next; /* Next entry in walking order. */
108 /* Information about accumulators to expand. */
110 struct var_to_expand
112 rtx_insn *insn; /* The insn in that the variable expansion occurs. */
113 rtx reg; /* The accumulator which is expanded. */
114 vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
115 struct var_to_expand *next; /* Next entry in walking order. */
116 enum rtx_code op; /* The type of the accumulation - addition, subtraction
117 or multiplication. */
118 int expansion_count; /* Count the number of expansions generated so far. */
119 int reuse_expansion; /* The expansion we intend to reuse to expand
120 the accumulator. If REUSE_EXPANSION is 0 reuse
121 the original accumulator. Else use
122 var_expansions[REUSE_EXPANSION - 1]. */
125 /* Hashtable helper for iv_to_split. */
127 struct iv_split_hasher : typed_free_remove <iv_to_split>
129 typedef iv_to_split *value_type;
130 typedef iv_to_split *compare_type;
131 static inline hashval_t hash (const iv_to_split *);
132 static inline bool equal (const iv_to_split *, const iv_to_split *);
136 /* A hash function for information about insns to split. */
138 inline hashval_t
139 iv_split_hasher::hash (const iv_to_split *ivts)
141 return (hashval_t) INSN_UID (ivts->insn);
144 /* An equality functions for information about insns to split. */
146 inline bool
147 iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
149 return i1->insn == i2->insn;
152 /* Hashtable helper for iv_to_split. */
154 struct var_expand_hasher : typed_free_remove <var_to_expand>
156 typedef var_to_expand *value_type;
157 typedef var_to_expand *compare_type;
158 static inline hashval_t hash (const var_to_expand *);
159 static inline bool equal (const var_to_expand *, const var_to_expand *);
162 /* Return a hash for VES. */
164 inline hashval_t
165 var_expand_hasher::hash (const var_to_expand *ves)
167 return (hashval_t) INSN_UID (ves->insn);
170 /* Return true if I1 and I2 refer to the same instruction. */
172 inline bool
173 var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
175 return i1->insn == i2->insn;
178 /* Information about optimization applied in
179 the unrolled loop. */
181 struct opt_info
183 hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
184 split. */
185 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
186 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
187 hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
188 insns with accumulators to expand. */
189 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
190 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
191 unsigned first_new_block; /* The first basic block that was
192 duplicated. */
193 basic_block loop_exit; /* The loop exit basic block. */
194 basic_block loop_preheader; /* The loop preheader basic block. */
197 static void decide_unroll_stupid (struct loop *, int);
198 static void decide_unroll_constant_iterations (struct loop *, int);
199 static void decide_unroll_runtime_iterations (struct loop *, int);
200 static void unroll_loop_stupid (struct loop *);
201 static void decide_unrolling (int);
202 static void unroll_loop_constant_iterations (struct loop *);
203 static void unroll_loop_runtime_iterations (struct loop *);
204 static struct opt_info *analyze_insns_in_loop (struct loop *);
205 static void opt_info_start_duplication (struct opt_info *);
206 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
207 static void free_opt_info (struct opt_info *);
208 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
209 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
210 static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
211 static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
212 static void insert_var_expansion_initialization (struct var_to_expand *,
213 basic_block);
214 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
215 basic_block);
216 static rtx get_expansion (struct var_to_expand *);
218 /* Emit a message summarizing the unroll that will be
219 performed for LOOP, along with the loop's location LOCUS, if
220 appropriate given the dump or -fopt-info settings. */
222 static void
223 report_unroll (struct loop *loop, location_t locus)
225 int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
227 if (loop->lpt_decision.decision == LPT_NONE)
228 return;
230 if (!dump_enabled_p ())
231 return;
233 dump_printf_loc (report_flags, locus,
234 "loop unrolled %d times",
235 loop->lpt_decision.times);
236 if (profile_info)
237 dump_printf (report_flags,
238 " (header execution count %d)",
239 (int)loop->header->count);
241 dump_printf (report_flags, "\n");
244 /* Decide whether unroll loops and how much. */
245 static void
246 decide_unrolling (int flags)
248 struct loop *loop;
250 /* Scan the loops, inner ones first. */
251 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
253 loop->lpt_decision.decision = LPT_NONE;
254 location_t locus = get_loop_location (loop);
256 if (dump_enabled_p ())
257 dump_printf_loc (TDF_RTL, locus,
258 ";; *** Considering loop %d at BB %d for "
259 "unrolling ***\n",
260 loop->num, loop->header->index);
262 /* Do not peel cold areas. */
263 if (optimize_loop_for_size_p (loop))
265 if (dump_file)
266 fprintf (dump_file, ";; Not considering loop, cold area\n");
267 continue;
270 /* Can the loop be manipulated? */
271 if (!can_duplicate_loop_p (loop))
273 if (dump_file)
274 fprintf (dump_file,
275 ";; Not considering loop, cannot duplicate\n");
276 continue;
279 /* Skip non-innermost loops. */
280 if (loop->inner)
282 if (dump_file)
283 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
284 continue;
287 loop->ninsns = num_loop_insns (loop);
288 loop->av_ninsns = average_num_loop_insns (loop);
290 /* Try transformations one by one in decreasing order of
291 priority. */
293 decide_unroll_constant_iterations (loop, flags);
294 if (loop->lpt_decision.decision == LPT_NONE)
295 decide_unroll_runtime_iterations (loop, flags);
296 if (loop->lpt_decision.decision == LPT_NONE)
297 decide_unroll_stupid (loop, flags);
299 report_unroll (loop, locus);
303 /* Unroll LOOPS. */
304 void
305 unroll_loops (int flags)
307 struct loop *loop;
308 bool changed = false;
310 /* Now decide rest of unrolling. */
311 decide_unrolling (flags);
313 /* Scan the loops, inner ones first. */
314 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
316 /* And perform the appropriate transformations. */
317 switch (loop->lpt_decision.decision)
319 case LPT_UNROLL_CONSTANT:
320 unroll_loop_constant_iterations (loop);
321 changed = true;
322 break;
323 case LPT_UNROLL_RUNTIME:
324 unroll_loop_runtime_iterations (loop);
325 changed = true;
326 break;
327 case LPT_UNROLL_STUPID:
328 unroll_loop_stupid (loop);
329 changed = true;
330 break;
331 case LPT_NONE:
332 break;
333 default:
334 gcc_unreachable ();
338 if (changed)
340 calculate_dominance_info (CDI_DOMINATORS);
341 fix_loop_structure (NULL);
344 iv_analysis_done ();
347 /* Check whether exit of the LOOP is at the end of loop body. */
349 static bool
350 loop_exit_at_end_p (struct loop *loop)
352 struct niter_desc *desc = get_simple_loop_desc (loop);
353 rtx_insn *insn;
355 /* We should never have conditional in latch block. */
356 gcc_assert (desc->in_edge->dest != loop->header);
358 if (desc->in_edge->dest != loop->latch)
359 return false;
361 /* Check that the latch is empty. */
362 FOR_BB_INSNS (loop->latch, insn)
364 if (INSN_P (insn) && active_insn_p (insn))
365 return false;
368 return true;
371 /* Decide whether to unroll LOOP iterating constant number of times
372 and how much. */
374 static void
375 decide_unroll_constant_iterations (struct loop *loop, int flags)
377 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
378 struct niter_desc *desc;
379 widest_int iterations;
381 if (!(flags & UAP_UNROLL))
383 /* We were not asked to, just return back silently. */
384 return;
387 if (dump_file)
388 fprintf (dump_file,
389 "\n;; Considering unrolling loop with constant "
390 "number of iterations\n");
392 /* nunroll = total number of copies of the original loop body in
393 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
394 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
395 nunroll_by_av
396 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
397 if (nunroll > nunroll_by_av)
398 nunroll = nunroll_by_av;
399 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
400 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
402 if (targetm.loop_unroll_adjust)
403 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
405 /* Skip big loops. */
406 if (nunroll <= 1)
408 if (dump_file)
409 fprintf (dump_file, ";; Not considering loop, is too big\n");
410 return;
413 /* Check for simple loops. */
414 desc = get_simple_loop_desc (loop);
416 /* Check number of iterations. */
417 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
419 if (dump_file)
420 fprintf (dump_file,
421 ";; Unable to prove that the loop iterates constant times\n");
422 return;
425 /* Check whether the loop rolls enough to consider.
426 Consult also loop bounds and profile; in the case the loop has more
427 than one exit it may well loop less than determined maximal number
428 of iterations. */
429 if (desc->niter < 2 * nunroll
430 || ((get_estimated_loop_iterations (loop, &iterations)
431 || get_max_loop_iterations (loop, &iterations))
432 && wi::ltu_p (iterations, 2 * nunroll)))
434 if (dump_file)
435 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
436 return;
439 /* Success; now compute number of iterations to unroll. We alter
440 nunroll so that as few as possible copies of loop body are
441 necessary, while still not decreasing the number of unrollings
442 too much (at most by 1). */
443 best_copies = 2 * nunroll + 10;
445 i = 2 * nunroll + 2;
446 if (i - 1 >= desc->niter)
447 i = desc->niter - 2;
449 for (; i >= nunroll - 1; i--)
451 unsigned exit_mod = desc->niter % (i + 1);
453 if (!loop_exit_at_end_p (loop))
454 n_copies = exit_mod + i + 1;
455 else if (exit_mod != (unsigned) i
456 || desc->noloop_assumptions != NULL_RTX)
457 n_copies = exit_mod + i + 2;
458 else
459 n_copies = i + 1;
461 if (n_copies < best_copies)
463 best_copies = n_copies;
464 best_unroll = i;
468 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
469 loop->lpt_decision.times = best_unroll;
472 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
473 The transformation does this:
475 for (i = 0; i < 102; i++)
476 body;
478 ==> (LOOP->LPT_DECISION.TIMES == 3)
480 i = 0;
481 body; i++;
482 body; i++;
483 while (i < 102)
485 body; i++;
486 body; i++;
487 body; i++;
488 body; i++;
491 static void
492 unroll_loop_constant_iterations (struct loop *loop)
494 unsigned HOST_WIDE_INT niter;
495 unsigned exit_mod;
496 sbitmap wont_exit;
497 unsigned i;
498 edge e;
499 unsigned max_unroll = loop->lpt_decision.times;
500 struct niter_desc *desc = get_simple_loop_desc (loop);
501 bool exit_at_end = loop_exit_at_end_p (loop);
502 struct opt_info *opt_info = NULL;
503 bool ok;
505 niter = desc->niter;
507 /* Should not get here (such loop should be peeled instead). */
508 gcc_assert (niter > max_unroll + 1);
510 exit_mod = niter % (max_unroll + 1);
512 wont_exit = sbitmap_alloc (max_unroll + 1);
513 bitmap_ones (wont_exit);
515 auto_vec<edge> remove_edges;
516 if (flag_split_ivs_in_unroller
517 || flag_variable_expansion_in_unroller)
518 opt_info = analyze_insns_in_loop (loop);
520 if (!exit_at_end)
522 /* The exit is not at the end of the loop; leave exit test
523 in the first copy, so that the loops that start with test
524 of exit condition have continuous body after unrolling. */
526 if (dump_file)
527 fprintf (dump_file, ";; Condition at beginning of loop.\n");
529 /* Peel exit_mod iterations. */
530 bitmap_clear_bit (wont_exit, 0);
531 if (desc->noloop_assumptions)
532 bitmap_clear_bit (wont_exit, 1);
534 if (exit_mod)
536 opt_info_start_duplication (opt_info);
537 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
538 exit_mod,
539 wont_exit, desc->out_edge,
540 &remove_edges,
541 DLTHE_FLAG_UPDATE_FREQ
542 | (opt_info && exit_mod > 1
543 ? DLTHE_RECORD_COPY_NUMBER
544 : 0));
545 gcc_assert (ok);
547 if (opt_info && exit_mod > 1)
548 apply_opt_in_copies (opt_info, exit_mod, false, false);
550 desc->noloop_assumptions = NULL_RTX;
551 desc->niter -= exit_mod;
552 loop->nb_iterations_upper_bound -= exit_mod;
553 if (loop->any_estimate
554 && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
555 loop->nb_iterations_estimate -= exit_mod;
556 else
557 loop->any_estimate = false;
560 bitmap_set_bit (wont_exit, 1);
562 else
564 /* Leave exit test in last copy, for the same reason as above if
565 the loop tests the condition at the end of loop body. */
567 if (dump_file)
568 fprintf (dump_file, ";; Condition at end of loop.\n");
570 /* We know that niter >= max_unroll + 2; so we do not need to care of
571 case when we would exit before reaching the loop. So just peel
572 exit_mod + 1 iterations. */
573 if (exit_mod != max_unroll
574 || desc->noloop_assumptions)
576 bitmap_clear_bit (wont_exit, 0);
577 if (desc->noloop_assumptions)
578 bitmap_clear_bit (wont_exit, 1);
580 opt_info_start_duplication (opt_info);
581 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
582 exit_mod + 1,
583 wont_exit, desc->out_edge,
584 &remove_edges,
585 DLTHE_FLAG_UPDATE_FREQ
586 | (opt_info && exit_mod > 0
587 ? DLTHE_RECORD_COPY_NUMBER
588 : 0));
589 gcc_assert (ok);
591 if (opt_info && exit_mod > 0)
592 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
594 desc->niter -= exit_mod + 1;
595 loop->nb_iterations_upper_bound -= exit_mod + 1;
596 if (loop->any_estimate
597 && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
598 loop->nb_iterations_estimate -= exit_mod + 1;
599 else
600 loop->any_estimate = false;
601 desc->noloop_assumptions = NULL_RTX;
603 bitmap_set_bit (wont_exit, 0);
604 bitmap_set_bit (wont_exit, 1);
607 bitmap_clear_bit (wont_exit, max_unroll);
610 /* Now unroll the loop. */
612 opt_info_start_duplication (opt_info);
613 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
614 max_unroll,
615 wont_exit, desc->out_edge,
616 &remove_edges,
617 DLTHE_FLAG_UPDATE_FREQ
618 | (opt_info
619 ? DLTHE_RECORD_COPY_NUMBER
620 : 0));
621 gcc_assert (ok);
623 if (opt_info)
625 apply_opt_in_copies (opt_info, max_unroll, true, true);
626 free_opt_info (opt_info);
629 free (wont_exit);
631 if (exit_at_end)
633 basic_block exit_block = get_bb_copy (desc->in_edge->src);
634 /* Find a new in and out edge; they are in the last copy we have made. */
636 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
638 desc->out_edge = EDGE_SUCC (exit_block, 0);
639 desc->in_edge = EDGE_SUCC (exit_block, 1);
641 else
643 desc->out_edge = EDGE_SUCC (exit_block, 1);
644 desc->in_edge = EDGE_SUCC (exit_block, 0);
648 desc->niter /= max_unroll + 1;
649 loop->nb_iterations_upper_bound
650 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
651 if (loop->any_estimate)
652 loop->nb_iterations_estimate
653 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
654 desc->niter_expr = GEN_INT (desc->niter);
656 /* Remove the edges. */
657 FOR_EACH_VEC_ELT (remove_edges, i, e)
658 remove_path (e);
660 if (dump_file)
661 fprintf (dump_file,
662 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
663 max_unroll, num_loop_insns (loop));
666 /* Decide whether to unroll LOOP iterating runtime computable number of times
667 and how much. */
668 static void
669 decide_unroll_runtime_iterations (struct loop *loop, int flags)
671 unsigned nunroll, nunroll_by_av, i;
672 struct niter_desc *desc;
673 widest_int iterations;
675 if (!(flags & UAP_UNROLL))
677 /* We were not asked to, just return back silently. */
678 return;
681 if (dump_file)
682 fprintf (dump_file,
683 "\n;; Considering unrolling loop with runtime "
684 "computable number of iterations\n");
686 /* nunroll = total number of copies of the original loop body in
687 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
688 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
689 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
690 if (nunroll > nunroll_by_av)
691 nunroll = nunroll_by_av;
692 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
693 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
695 if (targetm.loop_unroll_adjust)
696 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
698 /* Skip big loops. */
699 if (nunroll <= 1)
701 if (dump_file)
702 fprintf (dump_file, ";; Not considering loop, is too big\n");
703 return;
706 /* Check for simple loops. */
707 desc = get_simple_loop_desc (loop);
709 /* Check simpleness. */
710 if (!desc->simple_p || desc->assumptions)
712 if (dump_file)
713 fprintf (dump_file,
714 ";; Unable to prove that the number of iterations "
715 "can be counted in runtime\n");
716 return;
719 if (desc->const_iter)
721 if (dump_file)
722 fprintf (dump_file, ";; Loop iterates constant times\n");
723 return;
726 /* Check whether the loop rolls. */
727 if ((get_estimated_loop_iterations (loop, &iterations)
728 || get_max_loop_iterations (loop, &iterations))
729 && wi::ltu_p (iterations, 2 * nunroll))
731 if (dump_file)
732 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
733 return;
736 /* Success; now force nunroll to be power of 2, as we are unable to
737 cope with overflows in computation of number of iterations. */
738 for (i = 1; 2 * i <= nunroll; i *= 2)
739 continue;
741 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
742 loop->lpt_decision.times = i - 1;
745 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
746 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
747 and NULL is returned instead. */
749 basic_block
750 split_edge_and_insert (edge e, rtx_insn *insns)
752 basic_block bb;
754 if (!insns)
755 return NULL;
756 bb = split_edge (e);
757 emit_insn_after (insns, BB_END (bb));
759 /* ??? We used to assume that INSNS can contain control flow insns, and
760 that we had to try to find sub basic blocks in BB to maintain a valid
761 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
762 and call break_superblocks when going out of cfglayout mode. But it
763 turns out that this never happens; and that if it does ever happen,
764 the verify_flow_info at the end of the RTL loop passes would fail.
766 There are two reasons why we expected we could have control flow insns
767 in INSNS. The first is when a comparison has to be done in parts, and
768 the second is when the number of iterations is computed for loops with
769 the number of iterations known at runtime. In both cases, test cases
770 to get control flow in INSNS appear to be impossible to construct:
772 * If do_compare_rtx_and_jump needs several branches to do comparison
773 in a mode that needs comparison by parts, we cannot analyze the
774 number of iterations of the loop, and we never get to unrolling it.
776 * The code in expand_divmod that was suspected to cause creation of
777 branching code seems to be only accessed for signed division. The
778 divisions used by # of iterations analysis are always unsigned.
779 Problems might arise on architectures that emits branching code
780 for some operations that may appear in the unroller (especially
781 for division), but we have no such architectures.
783 Considering all this, it was decided that we should for now assume
784 that INSNS can in theory contain control flow insns, but in practice
785 it never does. So we don't handle the theoretical case, and should
786 a real failure ever show up, we have a pretty good clue for how to
787 fix it. */
789 return bb;
792 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
793 true, with probability PROB. If CINSN is not NULL, it is the insn to copy
794 in order to create a jump. */
796 static rtx_insn *
797 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
798 rtx_insn *cinsn)
800 rtx_insn *seq, *jump;
801 rtx cond;
802 machine_mode mode;
804 mode = GET_MODE (op0);
805 if (mode == VOIDmode)
806 mode = GET_MODE (op1);
808 start_sequence ();
809 if (GET_MODE_CLASS (mode) == MODE_CC)
811 /* A hack -- there seems to be no easy generic way how to make a
812 conditional jump from a ccmode comparison. */
813 gcc_assert (cinsn);
814 cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
815 gcc_assert (GET_CODE (cond) == comp);
816 gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
817 gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
818 emit_jump_insn (copy_insn (PATTERN (cinsn)));
819 jump = get_last_insn ();
820 gcc_assert (JUMP_P (jump));
821 JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
822 LABEL_NUSES (JUMP_LABEL (jump))++;
823 redirect_jump (jump, label, 0);
825 else
827 gcc_assert (!cinsn);
829 op0 = force_operand (op0, NULL_RTX);
830 op1 = force_operand (op1, NULL_RTX);
831 do_compare_rtx_and_jump (op0, op1, comp, 0,
832 mode, NULL_RTX, NULL_RTX, label, -1);
833 jump = get_last_insn ();
834 gcc_assert (JUMP_P (jump));
835 JUMP_LABEL (jump) = label;
836 LABEL_NUSES (label)++;
838 add_int_reg_note (jump, REG_BR_PROB, prob);
840 seq = get_insns ();
841 end_sequence ();
843 return seq;
846 /* Unroll LOOP for which we are able to count number of iterations in runtime
847 LOOP->LPT_DECISION.TIMES times. The transformation does this (with some
848 extra care for case n < 0):
850 for (i = 0; i < n; i++)
851 body;
853 ==> (LOOP->LPT_DECISION.TIMES == 3)
855 i = 0;
856 mod = n % 4;
858 switch (mod)
860 case 3:
861 body; i++;
862 case 2:
863 body; i++;
864 case 1:
865 body; i++;
866 case 0: ;
869 while (i < n)
871 body; i++;
872 body; i++;
873 body; i++;
874 body; i++;
877 static void
878 unroll_loop_runtime_iterations (struct loop *loop)
880 rtx old_niter, niter, tmp;
881 rtx_insn *init_code, *branch_code;
882 unsigned i, j, p;
883 basic_block preheader, *body, swtch, ezc_swtch;
884 sbitmap wont_exit;
885 int may_exit_copy;
886 unsigned n_peel;
887 edge e;
888 bool extra_zero_check, last_may_exit;
889 unsigned max_unroll = loop->lpt_decision.times;
890 struct niter_desc *desc = get_simple_loop_desc (loop);
891 bool exit_at_end = loop_exit_at_end_p (loop);
892 struct opt_info *opt_info = NULL;
893 bool ok;
895 if (flag_split_ivs_in_unroller
896 || flag_variable_expansion_in_unroller)
897 opt_info = analyze_insns_in_loop (loop);
899 /* Remember blocks whose dominators will have to be updated. */
900 auto_vec<basic_block> dom_bbs;
902 body = get_loop_body (loop);
903 for (i = 0; i < loop->num_nodes; i++)
905 vec<basic_block> ldom;
906 basic_block bb;
908 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
909 FOR_EACH_VEC_ELT (ldom, j, bb)
910 if (!flow_bb_inside_loop_p (loop, bb))
911 dom_bbs.safe_push (bb);
913 ldom.release ();
915 free (body);
917 if (!exit_at_end)
919 /* Leave exit in first copy (for explanation why see comment in
920 unroll_loop_constant_iterations). */
921 may_exit_copy = 0;
922 n_peel = max_unroll - 1;
923 extra_zero_check = true;
924 last_may_exit = false;
926 else
928 /* Leave exit in last copy (for explanation why see comment in
929 unroll_loop_constant_iterations). */
930 may_exit_copy = max_unroll;
931 n_peel = max_unroll;
932 extra_zero_check = false;
933 last_may_exit = true;
936 /* Get expression for number of iterations. */
937 start_sequence ();
938 old_niter = niter = gen_reg_rtx (desc->mode);
939 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
940 if (tmp != niter)
941 emit_move_insn (niter, tmp);
943 /* Count modulo by ANDing it with max_unroll; we use the fact that
944 the number of unrollings is a power of two, and thus this is correct
945 even if there is overflow in the computation. */
946 niter = expand_simple_binop (desc->mode, AND,
947 niter, gen_int_mode (max_unroll, desc->mode),
948 NULL_RTX, 0, OPTAB_LIB_WIDEN);
950 init_code = get_insns ();
951 end_sequence ();
952 unshare_all_rtl_in_chain (init_code);
954 /* Precondition the loop. */
955 split_edge_and_insert (loop_preheader_edge (loop), init_code);
957 auto_vec<edge> remove_edges;
959 wont_exit = sbitmap_alloc (max_unroll + 2);
961 /* Peel the first copy of loop body (almost always we must leave exit test
962 here; the only exception is when we have extra zero check and the number
963 of iterations is reliable. Also record the place of (possible) extra
964 zero check. */
965 bitmap_clear (wont_exit);
966 if (extra_zero_check
967 && !desc->noloop_assumptions)
968 bitmap_set_bit (wont_exit, 1);
969 ezc_swtch = loop_preheader_edge (loop)->src;
970 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
971 1, wont_exit, desc->out_edge,
972 &remove_edges,
973 DLTHE_FLAG_UPDATE_FREQ);
974 gcc_assert (ok);
976 /* Record the place where switch will be built for preconditioning. */
977 swtch = split_edge (loop_preheader_edge (loop));
979 for (i = 0; i < n_peel; i++)
981 /* Peel the copy. */
982 bitmap_clear (wont_exit);
983 if (i != n_peel - 1 || !last_may_exit)
984 bitmap_set_bit (wont_exit, 1);
985 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
986 1, wont_exit, desc->out_edge,
987 &remove_edges,
988 DLTHE_FLAG_UPDATE_FREQ);
989 gcc_assert (ok);
991 /* Create item for switch. */
992 j = n_peel - i - (extra_zero_check ? 0 : 1);
993 p = REG_BR_PROB_BASE / (i + 2);
995 preheader = split_edge (loop_preheader_edge (loop));
996 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
997 block_label (preheader), p,
998 NULL);
1000 /* We rely on the fact that the compare and jump cannot be optimized out,
1001 and hence the cfg we create is correct. */
1002 gcc_assert (branch_code != NULL_RTX);
1004 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1005 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1006 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1007 e = make_edge (swtch, preheader,
1008 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1009 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1010 e->probability = p;
1013 if (extra_zero_check)
1015 /* Add branch for zero iterations. */
1016 p = REG_BR_PROB_BASE / (max_unroll + 1);
1017 swtch = ezc_swtch;
1018 preheader = split_edge (loop_preheader_edge (loop));
1019 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1020 block_label (preheader), p,
1021 NULL);
1022 gcc_assert (branch_code != NULL_RTX);
1024 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1025 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1026 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1027 e = make_edge (swtch, preheader,
1028 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1029 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1030 e->probability = p;
1033 /* Recount dominators for outer blocks. */
1034 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1036 /* And unroll loop. */
1038 bitmap_ones (wont_exit);
1039 bitmap_clear_bit (wont_exit, may_exit_copy);
1040 opt_info_start_duplication (opt_info);
1042 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1043 max_unroll,
1044 wont_exit, desc->out_edge,
1045 &remove_edges,
1046 DLTHE_FLAG_UPDATE_FREQ
1047 | (opt_info
1048 ? DLTHE_RECORD_COPY_NUMBER
1049 : 0));
1050 gcc_assert (ok);
1052 if (opt_info)
1054 apply_opt_in_copies (opt_info, max_unroll, true, true);
1055 free_opt_info (opt_info);
1058 free (wont_exit);
1060 if (exit_at_end)
1062 basic_block exit_block = get_bb_copy (desc->in_edge->src);
1063 /* Find a new in and out edge; they are in the last copy we have
1064 made. */
1066 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1068 desc->out_edge = EDGE_SUCC (exit_block, 0);
1069 desc->in_edge = EDGE_SUCC (exit_block, 1);
1071 else
1073 desc->out_edge = EDGE_SUCC (exit_block, 1);
1074 desc->in_edge = EDGE_SUCC (exit_block, 0);
1078 /* Remove the edges. */
1079 FOR_EACH_VEC_ELT (remove_edges, i, e)
1080 remove_path (e);
1082 /* We must be careful when updating the number of iterations due to
1083 preconditioning and the fact that the value must be valid at entry
1084 of the loop. After passing through the above code, we see that
1085 the correct new number of iterations is this: */
1086 gcc_assert (!desc->const_iter);
1087 desc->niter_expr =
1088 simplify_gen_binary (UDIV, desc->mode, old_niter,
1089 gen_int_mode (max_unroll + 1, desc->mode));
1090 loop->nb_iterations_upper_bound
1091 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
1092 if (loop->any_estimate)
1093 loop->nb_iterations_estimate
1094 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
1095 if (exit_at_end)
1097 desc->niter_expr =
1098 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1099 desc->noloop_assumptions = NULL_RTX;
1100 --loop->nb_iterations_upper_bound;
1101 if (loop->any_estimate
1102 && loop->nb_iterations_estimate != 0)
1103 --loop->nb_iterations_estimate;
1104 else
1105 loop->any_estimate = false;
1108 if (dump_file)
1109 fprintf (dump_file,
1110 ";; Unrolled loop %d times, counting # of iterations "
1111 "in runtime, %i insns\n",
1112 max_unroll, num_loop_insns (loop));
1115 /* Decide whether to unroll LOOP stupidly and how much. */
1116 static void
1117 decide_unroll_stupid (struct loop *loop, int flags)
1119 unsigned nunroll, nunroll_by_av, i;
1120 struct niter_desc *desc;
1121 widest_int iterations;
1123 if (!(flags & UAP_UNROLL_ALL))
1125 /* We were not asked to, just return back silently. */
1126 return;
1129 if (dump_file)
1130 fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1132 /* nunroll = total number of copies of the original loop body in
1133 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1134 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1135 nunroll_by_av
1136 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1137 if (nunroll > nunroll_by_av)
1138 nunroll = nunroll_by_av;
1139 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1140 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1142 if (targetm.loop_unroll_adjust)
1143 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1145 /* Skip big loops. */
1146 if (nunroll <= 1)
1148 if (dump_file)
1149 fprintf (dump_file, ";; Not considering loop, is too big\n");
1150 return;
1153 /* Check for simple loops. */
1154 desc = get_simple_loop_desc (loop);
1156 /* Check simpleness. */
1157 if (desc->simple_p && !desc->assumptions)
1159 if (dump_file)
1160 fprintf (dump_file, ";; The loop is simple\n");
1161 return;
1164 /* Do not unroll loops with branches inside -- it increases number
1165 of mispredicts.
1166 TODO: this heuristic needs tunning; call inside the loop body
1167 is also relatively good reason to not unroll. */
1168 if (num_loop_branches (loop) > 1)
1170 if (dump_file)
1171 fprintf (dump_file, ";; Not unrolling, contains branches\n");
1172 return;
1175 /* Check whether the loop rolls. */
1176 if ((get_estimated_loop_iterations (loop, &iterations)
1177 || get_max_loop_iterations (loop, &iterations))
1178 && wi::ltu_p (iterations, 2 * nunroll))
1180 if (dump_file)
1181 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1182 return;
1185 /* Success. Now force nunroll to be power of 2, as it seems that this
1186 improves results (partially because of better alignments, partially
1187 because of some dark magic). */
1188 for (i = 1; 2 * i <= nunroll; i *= 2)
1189 continue;
1191 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1192 loop->lpt_decision.times = i - 1;
1195 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1197 while (cond)
1198 body;
1200 ==> (LOOP->LPT_DECISION.TIMES == 3)
1202 while (cond)
1204 body;
1205 if (!cond) break;
1206 body;
1207 if (!cond) break;
1208 body;
1209 if (!cond) break;
1210 body;
1213 static void
1214 unroll_loop_stupid (struct loop *loop)
1216 sbitmap wont_exit;
1217 unsigned nunroll = loop->lpt_decision.times;
1218 struct niter_desc *desc = get_simple_loop_desc (loop);
1219 struct opt_info *opt_info = NULL;
1220 bool ok;
1222 if (flag_split_ivs_in_unroller
1223 || flag_variable_expansion_in_unroller)
1224 opt_info = analyze_insns_in_loop (loop);
1227 wont_exit = sbitmap_alloc (nunroll + 1);
1228 bitmap_clear (wont_exit);
1229 opt_info_start_duplication (opt_info);
1231 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1232 nunroll, wont_exit,
1233 NULL, NULL,
1234 DLTHE_FLAG_UPDATE_FREQ
1235 | (opt_info
1236 ? DLTHE_RECORD_COPY_NUMBER
1237 : 0));
1238 gcc_assert (ok);
1240 if (opt_info)
1242 apply_opt_in_copies (opt_info, nunroll, true, true);
1243 free_opt_info (opt_info);
1246 free (wont_exit);
1248 if (desc->simple_p)
1250 /* We indeed may get here provided that there are nontrivial assumptions
1251 for a loop to be really simple. We could update the counts, but the
1252 problem is that we are unable to decide which exit will be taken
1253 (not really true in case the number of iterations is constant,
1254 but no one will do anything with this information, so we do not
1255 worry about it). */
1256 desc->simple_p = false;
1259 if (dump_file)
1260 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1261 nunroll, num_loop_insns (loop));
1264 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1265 Set *DEBUG_USES to the number of debug insns that reference the
1266 variable. */
1268 static bool
1269 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1270 int *debug_uses)
1272 basic_block *body, bb;
1273 unsigned i;
1274 int count_ref = 0;
1275 rtx_insn *insn;
1277 body = get_loop_body (loop);
1278 for (i = 0; i < loop->num_nodes; i++)
1280 bb = body[i];
1282 FOR_BB_INSNS (bb, insn)
1283 if (!rtx_referenced_p (reg, insn))
1284 continue;
1285 else if (DEBUG_INSN_P (insn))
1286 ++*debug_uses;
1287 else if (++count_ref > 1)
1288 break;
1290 free (body);
1291 return (count_ref == 1);
1294 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1296 static void
1297 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1299 basic_block *body, bb;
1300 unsigned i;
1301 rtx_insn *insn;
1303 body = get_loop_body (loop);
1304 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1306 bb = body[i];
1308 FOR_BB_INSNS (bb, insn)
1309 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1310 continue;
1311 else
1313 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1314 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1315 if (!--debug_uses)
1316 break;
1319 free (body);
1322 /* Determine whether INSN contains an accumulator
1323 which can be expanded into separate copies,
1324 one for each copy of the LOOP body.
1326 for (i = 0 ; i < n; i++)
1327 sum += a[i];
1331 sum += a[i]
1332 ....
1333 i = i+1;
1334 sum1 += a[i]
1335 ....
1336 i = i+1
1337 sum2 += a[i];
1338 ....
1340 Return NULL if INSN contains no opportunity for expansion of accumulator.
1341 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1342 information and return a pointer to it.
1345 static struct var_to_expand *
1346 analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
1348 rtx set, dest, src;
1349 struct var_to_expand *ves;
1350 unsigned accum_pos;
1351 enum rtx_code code;
1352 int debug_uses = 0;
1354 set = single_set (insn);
1355 if (!set)
1356 return NULL;
1358 dest = SET_DEST (set);
1359 src = SET_SRC (set);
1360 code = GET_CODE (src);
1362 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1363 return NULL;
1365 if (FLOAT_MODE_P (GET_MODE (dest)))
1367 if (!flag_associative_math)
1368 return NULL;
1369 /* In the case of FMA, we're also changing the rounding. */
1370 if (code == FMA && !flag_unsafe_math_optimizations)
1371 return NULL;
1374 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1375 in MD. But if there is no optab to generate the insn, we can not
1376 perform the variable expansion. This can happen if an MD provides
1377 an insn but not a named pattern to generate it, for example to avoid
1378 producing code that needs additional mode switches like for x87/mmx.
1380 So we check have_insn_for which looks for an optab for the operation
1381 in SRC. If it doesn't exist, we can't perform the expansion even
1382 though INSN is valid. */
1383 if (!have_insn_for (code, GET_MODE (src)))
1384 return NULL;
1386 if (!REG_P (dest)
1387 && !(GET_CODE (dest) == SUBREG
1388 && REG_P (SUBREG_REG (dest))))
1389 return NULL;
1391 /* Find the accumulator use within the operation. */
1392 if (code == FMA)
1394 /* We only support accumulation via FMA in the ADD position. */
1395 if (!rtx_equal_p (dest, XEXP (src, 2)))
1396 return NULL;
1397 accum_pos = 2;
1399 else if (rtx_equal_p (dest, XEXP (src, 0)))
1400 accum_pos = 0;
1401 else if (rtx_equal_p (dest, XEXP (src, 1)))
1403 /* The method of expansion that we are using; which includes the
1404 initialization of the expansions with zero and the summation of
1405 the expansions at the end of the computation will yield wrong
1406 results for (x = something - x) thus avoid using it in that case. */
1407 if (code == MINUS)
1408 return NULL;
1409 accum_pos = 1;
1411 else
1412 return NULL;
1414 /* It must not otherwise be used. */
1415 if (code == FMA)
1417 if (rtx_referenced_p (dest, XEXP (src, 0))
1418 || rtx_referenced_p (dest, XEXP (src, 1)))
1419 return NULL;
1421 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1422 return NULL;
1424 /* It must be used in exactly one insn. */
1425 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1426 return NULL;
1428 if (dump_file)
1430 fprintf (dump_file, "\n;; Expanding Accumulator ");
1431 print_rtl (dump_file, dest);
1432 fprintf (dump_file, "\n");
1435 if (debug_uses)
1436 /* Instead of resetting the debug insns, we could replace each
1437 debug use in the loop with the sum or product of all expanded
1438 accummulators. Since we'll only know of all expansions at the
1439 end, we'd have to keep track of which vars_to_expand a debug
1440 insn in the loop references, take note of each copy of the
1441 debug insn during unrolling, and when it's all done, compute
1442 the sum or product of each variable and adjust the original
1443 debug insn and each copy thereof. What a pain! */
1444 reset_debug_uses_in_loop (loop, dest, debug_uses);
1446 /* Record the accumulator to expand. */
1447 ves = XNEW (struct var_to_expand);
1448 ves->insn = insn;
1449 ves->reg = copy_rtx (dest);
1450 ves->var_expansions.create (1);
1451 ves->next = NULL;
1452 ves->op = GET_CODE (src);
1453 ves->expansion_count = 0;
1454 ves->reuse_expansion = 0;
1455 return ves;
1458 /* Determine whether there is an induction variable in INSN that
1459 we would like to split during unrolling.
1461 I.e. replace
1463 i = i + 1;
1465 i = i + 1;
1467 i = i + 1;
1470 type chains by
1472 i0 = i + 1
1474 i = i0 + 1
1476 i = i0 + 2
1479 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1480 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1481 pointer to it. */
1483 static struct iv_to_split *
1484 analyze_iv_to_split_insn (rtx_insn *insn)
1486 rtx set, dest;
1487 struct rtx_iv iv;
1488 struct iv_to_split *ivts;
1489 bool ok;
1491 /* For now we just split the basic induction variables. Later this may be
1492 extended for example by selecting also addresses of memory references. */
1493 set = single_set (insn);
1494 if (!set)
1495 return NULL;
1497 dest = SET_DEST (set);
1498 if (!REG_P (dest))
1499 return NULL;
1501 if (!biv_p (insn, dest))
1502 return NULL;
1504 ok = iv_analyze_result (insn, dest, &iv);
1506 /* This used to be an assert under the assumption that if biv_p returns
1507 true that iv_analyze_result must also return true. However, that
1508 assumption is not strictly correct as evidenced by pr25569.
1510 Returning NULL when iv_analyze_result returns false is safe and
1511 avoids the problems in pr25569 until the iv_analyze_* routines
1512 can be fixed, which is apparently hard and time consuming
1513 according to their author. */
1514 if (! ok)
1515 return NULL;
1517 if (iv.step == const0_rtx
1518 || iv.mode != iv.extend_mode)
1519 return NULL;
1521 /* Record the insn to split. */
1522 ivts = XNEW (struct iv_to_split);
1523 ivts->insn = insn;
1524 ivts->orig_var = dest;
1525 ivts->base_var = NULL_RTX;
1526 ivts->step = iv.step;
1527 ivts->next = NULL;
1529 return ivts;
1532 /* Determines which of insns in LOOP can be optimized.
1533 Return a OPT_INFO struct with the relevant hash tables filled
1534 with all insns to be optimized. The FIRST_NEW_BLOCK field
1535 is undefined for the return value. */
1537 static struct opt_info *
1538 analyze_insns_in_loop (struct loop *loop)
1540 basic_block *body, bb;
1541 unsigned i;
1542 struct opt_info *opt_info = XCNEW (struct opt_info);
1543 rtx_insn *insn;
1544 struct iv_to_split *ivts = NULL;
1545 struct var_to_expand *ves = NULL;
1546 iv_to_split **slot1;
1547 var_to_expand **slot2;
1548 vec<edge> edges = get_loop_exit_edges (loop);
1549 edge exit;
1550 bool can_apply = false;
1552 iv_analysis_loop_init (loop);
1554 body = get_loop_body (loop);
1556 if (flag_split_ivs_in_unroller)
1558 opt_info->insns_to_split
1559 = new hash_table<iv_split_hasher> (5 * loop->num_nodes);
1560 opt_info->iv_to_split_head = NULL;
1561 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1564 /* Record the loop exit bb and loop preheader before the unrolling. */
1565 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1567 if (edges.length () == 1)
1569 exit = edges[0];
1570 if (!(exit->flags & EDGE_COMPLEX))
1572 opt_info->loop_exit = split_edge (exit);
1573 can_apply = true;
1577 if (flag_variable_expansion_in_unroller
1578 && can_apply)
1580 opt_info->insns_with_var_to_expand
1581 = new hash_table<var_expand_hasher> (5 * loop->num_nodes);
1582 opt_info->var_to_expand_head = NULL;
1583 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1586 for (i = 0; i < loop->num_nodes; i++)
1588 bb = body[i];
1589 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1590 continue;
1592 FOR_BB_INSNS (bb, insn)
1594 if (!INSN_P (insn))
1595 continue;
1597 if (opt_info->insns_to_split)
1598 ivts = analyze_iv_to_split_insn (insn);
1600 if (ivts)
1602 slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
1603 gcc_assert (*slot1 == NULL);
1604 *slot1 = ivts;
1605 *opt_info->iv_to_split_tail = ivts;
1606 opt_info->iv_to_split_tail = &ivts->next;
1607 continue;
1610 if (opt_info->insns_with_var_to_expand)
1611 ves = analyze_insn_to_expand_var (loop, insn);
1613 if (ves)
1615 slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
1616 gcc_assert (*slot2 == NULL);
1617 *slot2 = ves;
1618 *opt_info->var_to_expand_tail = ves;
1619 opt_info->var_to_expand_tail = &ves->next;
1624 edges.release ();
1625 free (body);
1626 return opt_info;
1629 /* Called just before loop duplication. Records start of duplicated area
1630 to OPT_INFO. */
1632 static void
1633 opt_info_start_duplication (struct opt_info *opt_info)
1635 if (opt_info)
1636 opt_info->first_new_block = last_basic_block_for_fn (cfun);
1639 /* Determine the number of iterations between initialization of the base
1640 variable and the current copy (N_COPY). N_COPIES is the total number
1641 of newly created copies. UNROLLING is true if we are unrolling
1642 (not peeling) the loop. */
1644 static unsigned
1645 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1647 if (unrolling)
1649 /* If we are unrolling, initialization is done in the original loop
1650 body (number 0). */
1651 return n_copy;
1653 else
1655 /* If we are peeling, the copy in that the initialization occurs has
1656 number 1. The original loop (number 0) is the last. */
1657 if (n_copy)
1658 return n_copy - 1;
1659 else
1660 return n_copies;
1664 /* Allocate basic variable for the induction variable chain. */
1666 static void
1667 allocate_basic_variable (struct iv_to_split *ivts)
1669 rtx expr = SET_SRC (single_set (ivts->insn));
1671 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1674 /* Insert initialization of basic variable of IVTS before INSN, taking
1675 the initial value from INSN. */
1677 static void
1678 insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
1680 rtx expr = copy_rtx (SET_SRC (single_set (insn)));
1681 rtx_insn *seq;
1683 start_sequence ();
1684 expr = force_operand (expr, ivts->base_var);
1685 if (expr != ivts->base_var)
1686 emit_move_insn (ivts->base_var, expr);
1687 seq = get_insns ();
1688 end_sequence ();
1690 emit_insn_before (seq, insn);
1693 /* Replace the use of induction variable described in IVTS in INSN
1694 by base variable + DELTA * step. */
1696 static void
1697 split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
1699 rtx expr, *loc, incr, var;
1700 rtx_insn *seq;
1701 machine_mode mode = GET_MODE (ivts->base_var);
1702 rtx src, dest, set;
1704 /* Construct base + DELTA * step. */
1705 if (!delta)
1706 expr = ivts->base_var;
1707 else
1709 incr = simplify_gen_binary (MULT, mode,
1710 ivts->step, gen_int_mode (delta, mode));
1711 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
1712 ivts->base_var, incr);
1715 /* Figure out where to do the replacement. */
1716 loc = &SET_SRC (single_set (insn));
1718 /* If we can make the replacement right away, we're done. */
1719 if (validate_change (insn, loc, expr, 0))
1720 return;
1722 /* Otherwise, force EXPR into a register and try again. */
1723 start_sequence ();
1724 var = gen_reg_rtx (mode);
1725 expr = force_operand (expr, var);
1726 if (expr != var)
1727 emit_move_insn (var, expr);
1728 seq = get_insns ();
1729 end_sequence ();
1730 emit_insn_before (seq, insn);
1732 if (validate_change (insn, loc, var, 0))
1733 return;
1735 /* The last chance. Try recreating the assignment in insn
1736 completely from scratch. */
1737 set = single_set (insn);
1738 gcc_assert (set);
1740 start_sequence ();
1741 *loc = var;
1742 src = copy_rtx (SET_SRC (set));
1743 dest = copy_rtx (SET_DEST (set));
1744 src = force_operand (src, dest);
1745 if (src != dest)
1746 emit_move_insn (dest, src);
1747 seq = get_insns ();
1748 end_sequence ();
1750 emit_insn_before (seq, insn);
1751 delete_insn (insn);
1755 /* Return one expansion of the accumulator recorded in struct VE. */
1757 static rtx
1758 get_expansion (struct var_to_expand *ve)
1760 rtx reg;
1762 if (ve->reuse_expansion == 0)
1763 reg = ve->reg;
1764 else
1765 reg = ve->var_expansions[ve->reuse_expansion - 1];
1767 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
1768 ve->reuse_expansion = 0;
1769 else
1770 ve->reuse_expansion++;
1772 return reg;
1776 /* Given INSN replace the uses of the accumulator recorded in VE
1777 with a new register. */
1779 static void
1780 expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
1782 rtx new_reg, set;
1783 bool really_new_expansion = false;
1785 set = single_set (insn);
1786 gcc_assert (set);
1788 /* Generate a new register only if the expansion limit has not been
1789 reached. Else reuse an already existing expansion. */
1790 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
1792 really_new_expansion = true;
1793 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
1795 else
1796 new_reg = get_expansion (ve);
1798 validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
1799 if (apply_change_group ())
1800 if (really_new_expansion)
1802 ve->var_expansions.safe_push (new_reg);
1803 ve->expansion_count++;
1807 /* Initialize the variable expansions in loop preheader. PLACE is the
1808 loop-preheader basic block where the initialization of the
1809 expansions should take place. The expansions are initialized with
1810 (-0) when the operation is plus or minus to honor sign zero. This
1811 way we can prevent cases where the sign of the final result is
1812 effected by the sign of the expansion. Here is an example to
1813 demonstrate this:
1815 for (i = 0 ; i < n; i++)
1816 sum += something;
1820 sum += something
1821 ....
1822 i = i+1;
1823 sum1 += something
1824 ....
1825 i = i+1
1826 sum2 += something;
1827 ....
1829 When SUM is initialized with -zero and SOMETHING is also -zero; the
1830 final result of sum should be -zero thus the expansions sum1 and sum2
1831 should be initialized with -zero as well (otherwise we will get +zero
1832 as the final result). */
1834 static void
1835 insert_var_expansion_initialization (struct var_to_expand *ve,
1836 basic_block place)
1838 rtx_insn *seq;
1839 rtx var, zero_init;
1840 unsigned i;
1841 machine_mode mode = GET_MODE (ve->reg);
1842 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
1844 if (ve->var_expansions.length () == 0)
1845 return;
1847 start_sequence ();
1848 switch (ve->op)
1850 case FMA:
1851 /* Note that we only accumulate FMA via the ADD operand. */
1852 case PLUS:
1853 case MINUS:
1854 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1856 if (honor_signed_zero_p)
1857 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
1858 else
1859 zero_init = CONST0_RTX (mode);
1860 emit_move_insn (var, zero_init);
1862 break;
1864 case MULT:
1865 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1867 zero_init = CONST1_RTX (GET_MODE (var));
1868 emit_move_insn (var, zero_init);
1870 break;
1872 default:
1873 gcc_unreachable ();
1876 seq = get_insns ();
1877 end_sequence ();
1879 emit_insn_after (seq, BB_END (place));
1882 /* Combine the variable expansions at the loop exit. PLACE is the
1883 loop exit basic block where the summation of the expansions should
1884 take place. */
1886 static void
1887 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
1889 rtx sum = ve->reg;
1890 rtx expr, var;
1891 rtx_insn *seq, *insn;
1892 unsigned i;
1894 if (ve->var_expansions.length () == 0)
1895 return;
1897 start_sequence ();
1898 switch (ve->op)
1900 case FMA:
1901 /* Note that we only accumulate FMA via the ADD operand. */
1902 case PLUS:
1903 case MINUS:
1904 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1905 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
1906 break;
1908 case MULT:
1909 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1910 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
1911 break;
1913 default:
1914 gcc_unreachable ();
1917 expr = force_operand (sum, ve->reg);
1918 if (expr != ve->reg)
1919 emit_move_insn (ve->reg, expr);
1920 seq = get_insns ();
1921 end_sequence ();
1923 insn = BB_HEAD (place);
1924 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
1925 insn = NEXT_INSN (insn);
1927 emit_insn_after (seq, insn);
1930 /* Strip away REG_EQUAL notes for IVs we're splitting.
1932 Updating REG_EQUAL notes for IVs we split is tricky: We
1933 cannot tell until after unrolling, DF-rescanning, and liveness
1934 updating, whether an EQ_USE is reached by the split IV while
1935 the IV reg is still live. See PR55006.
1937 ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
1938 because RTL loop-iv requires us to defer rescanning insns and
1939 any notes attached to them. So resort to old techniques... */
1941 static void
1942 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
1944 struct iv_to_split *ivts;
1945 rtx note = find_reg_equal_equiv_note (insn);
1946 if (! note)
1947 return;
1948 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1949 if (reg_mentioned_p (ivts->orig_var, note))
1951 remove_note (insn, note);
1952 return;
1956 /* Apply loop optimizations in loop copies using the
1957 data which gathered during the unrolling. Structure
1958 OPT_INFO record that data.
1960 UNROLLING is true if we unrolled (not peeled) the loop.
1961 REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
1962 the loop (as it should happen in complete unrolling, but not in ordinary
1963 peeling of the loop). */
1965 static void
1966 apply_opt_in_copies (struct opt_info *opt_info,
1967 unsigned n_copies, bool unrolling,
1968 bool rewrite_original_loop)
1970 unsigned i, delta;
1971 basic_block bb, orig_bb;
1972 rtx_insn *insn, *orig_insn, *next;
1973 struct iv_to_split ivts_templ, *ivts;
1974 struct var_to_expand ve_templ, *ves;
1976 /* Sanity check -- we need to put initialization in the original loop
1977 body. */
1978 gcc_assert (!unrolling || rewrite_original_loop);
1980 /* Allocate the basic variables (i0). */
1981 if (opt_info->insns_to_split)
1982 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1983 allocate_basic_variable (ivts);
1985 for (i = opt_info->first_new_block;
1986 i < (unsigned) last_basic_block_for_fn (cfun);
1987 i++)
1989 bb = BASIC_BLOCK_FOR_FN (cfun, i);
1990 orig_bb = get_bb_original (bb);
1992 /* bb->aux holds position in copy sequence initialized by
1993 duplicate_loop_to_header_edge. */
1994 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
1995 unrolling);
1996 bb->aux = 0;
1997 orig_insn = BB_HEAD (orig_bb);
1998 FOR_BB_INSNS_SAFE (bb, insn, next)
2000 if (!INSN_P (insn)
2001 || (DEBUG_INSN_P (insn)
2002 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2003 continue;
2005 while (!INSN_P (orig_insn)
2006 || (DEBUG_INSN_P (orig_insn)
2007 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2008 == LABEL_DECL)))
2009 orig_insn = NEXT_INSN (orig_insn);
2011 ivts_templ.insn = orig_insn;
2012 ve_templ.insn = orig_insn;
2014 /* Apply splitting iv optimization. */
2015 if (opt_info->insns_to_split)
2017 maybe_strip_eq_note_for_split_iv (opt_info, insn);
2019 ivts = opt_info->insns_to_split->find (&ivts_templ);
2021 if (ivts)
2023 gcc_assert (GET_CODE (PATTERN (insn))
2024 == GET_CODE (PATTERN (orig_insn)));
2026 if (!delta)
2027 insert_base_initialization (ivts, insn);
2028 split_iv (ivts, insn, delta);
2031 /* Apply variable expansion optimization. */
2032 if (unrolling && opt_info->insns_with_var_to_expand)
2034 ves = (struct var_to_expand *)
2035 opt_info->insns_with_var_to_expand->find (&ve_templ);
2036 if (ves)
2038 gcc_assert (GET_CODE (PATTERN (insn))
2039 == GET_CODE (PATTERN (orig_insn)));
2040 expand_var_during_unrolling (ves, insn);
2043 orig_insn = NEXT_INSN (orig_insn);
2047 if (!rewrite_original_loop)
2048 return;
2050 /* Initialize the variable expansions in the loop preheader
2051 and take care of combining them at the loop exit. */
2052 if (opt_info->insns_with_var_to_expand)
2054 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2055 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2056 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2057 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2060 /* Rewrite also the original loop body. Find them as originals of the blocks
2061 in the last copied iteration, i.e. those that have
2062 get_bb_copy (get_bb_original (bb)) == bb. */
2063 for (i = opt_info->first_new_block;
2064 i < (unsigned) last_basic_block_for_fn (cfun);
2065 i++)
2067 bb = BASIC_BLOCK_FOR_FN (cfun, i);
2068 orig_bb = get_bb_original (bb);
2069 if (get_bb_copy (orig_bb) != bb)
2070 continue;
2072 delta = determine_split_iv_delta (0, n_copies, unrolling);
2073 for (orig_insn = BB_HEAD (orig_bb);
2074 orig_insn != NEXT_INSN (BB_END (bb));
2075 orig_insn = next)
2077 next = NEXT_INSN (orig_insn);
2079 if (!INSN_P (orig_insn))
2080 continue;
2082 ivts_templ.insn = orig_insn;
2083 if (opt_info->insns_to_split)
2085 maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2087 ivts = (struct iv_to_split *)
2088 opt_info->insns_to_split->find (&ivts_templ);
2089 if (ivts)
2091 if (!delta)
2092 insert_base_initialization (ivts, orig_insn);
2093 split_iv (ivts, orig_insn, delta);
2094 continue;
2102 /* Release OPT_INFO. */
2104 static void
2105 free_opt_info (struct opt_info *opt_info)
2107 delete opt_info->insns_to_split;
2108 opt_info->insns_to_split = NULL;
2109 if (opt_info->insns_with_var_to_expand)
2111 struct var_to_expand *ves;
2113 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2114 ves->var_expansions.release ();
2115 delete opt_info->insns_with_var_to_expand;
2116 opt_info->insns_with_var_to_expand = NULL;
2118 free (opt_info);