/* Loop unrolling.
   Copyright (C) 2002-2015 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "hard-reg-set.h"
#include "obstack.h"
#include "profile.h"
#include "predict.h"
#include "function.h"
#include "dominance.h"
#include "cfg.h"
#include "cfgrtl.h"
#include "basic-block.h"
#include "cfgloop.h"
#include "params.h"
#include "insn-codes.h"
#include "optabs.h"
#include "hashtab.h"
#include "flags.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "hash-table.h"
#include "recog.h"
#include "target.h"
#include "dumpfile.h"

/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from greater sequentiality
   of code, better code to optimize for further passes, and in some cases
   from fewer tests of exit conditions.  The main problem is code growth,
   which impacts performance negatively due to cache effects.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of the exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations.
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code.
   For more detailed descriptions of each of those, see the comments at
   the appropriate function below.

   There are a number of parameters (defined and described in params.def)
   that control how much we unroll.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of
   several percent.  */

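/* Three illustrative loops, one per strategy above (examples only; the
   analyses below work on RTL, not source):

     for (i = 0; i < 100; i++)   -- rolls a constant number of times
       a[i] = 0;

     for (i = 0; i < n; i++)     -- rolls a number of times computable
       a[i] = 0;                    only at runtime

     while (*p++ != 0)           -- iteration count not computable, so
       len++;                       only the "stupid" unrolling applies.  */
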
/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;	/* The insn in which the induction variable occurs.  */
  rtx orig_var;		/* The variable (register) for the IV before split.  */
  rtx base_var;		/* The variable on which the values in the further
			   iterations are based.  */
  rtx step;		/* Step of the induction variable.  */
  struct iv_to_split *next;	/* Next entry in walking order.  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;	/* The insn in which the variable expansion occurs.  */
  rtx reg;		/* The accumulator which is expanded.  */
  vec<rtx> var_expansions;	/* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;	/* Next entry in walking order.  */
  enum rtx_code op;	/* The type of the accumulation - addition, subtraction
			   or multiplication.  */
  int expansion_count;	/* Count the number of expansions generated so far.  */
  int reuse_expansion;	/* The expansion we intend to reuse to expand
			   the accumulator.  If REUSE_EXPANSION is 0 reuse
			   the original accumulator.  Else use
			   var_expansions[REUSE_EXPANSION - 1].  */
};

/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : typed_free_remove <iv_to_split>
{
  typedef iv_to_split *value_type;
  typedef iv_to_split *compare_type;
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};

/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const iv_to_split *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}

/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
{
  return i1->insn == i2->insn;
}

/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : typed_free_remove <var_to_expand>
{
  typedef var_to_expand *value_type;
  typedef var_to_expand *compare_type;
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};

/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const var_to_expand *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}

/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
{
  return i1->insn == i2->insn;
}

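/* A minimal usage sketch for the hashers above (this mirrors what
   apply_opt_in_copies below actually does): lookups are keyed on the
   insn alone, so a stack-allocated template with only INSN filled in
   is sufficient:

     struct iv_to_split templ, *ivts;
     templ.insn = insn;
     ivts = opt_info->insns_to_split->find (&templ);

   The remaining fields of the template are ignored by hash and equal.  */
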
/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};

static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void unroll_loop_stupid (struct loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);

/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (struct loop *loop, location_t locus)
{
  int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_printf_loc (report_flags, locus,
                   "loop unrolled %d times",
                   loop->lpt_decision.times);
  if (profile_info)
    dump_printf (report_flags,
                 " (header execution count %d)",
                 (int)loop->header->count);

  dump_printf (report_flags, "\n");
}

/* Decide whether to unroll loops and how much.  */
static void
decide_unrolling (int flags)
{
  struct loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (TDF_RTL, locus,
			 ";; *** Considering loop %d at BB %d for "
			 "unrolling ***\n",
			 loop->num, loop->header->index);

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
	 priority.  */

      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}

/* Unroll LOOPS.  */
void
unroll_loops (int flags)
{
  struct loop *loop;
  bool changed = false;

  /* Now decide rest of unrolling.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }

  iv_analysis_done ();
}

/* Check whether the exit of the LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn) && active_insn_p (insn))
	return false;
    }

  return true;
}

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with constant "
	     "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in case the loop has more
     than one exit it may well loop less than the determined maximal
     number of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}

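/* A worked example of the selection loop above, assuming the exit test is
   not at the end of the loop body: for desc->niter == 100 and nunroll == 8,
   i == 9 gives exit_mod == 100 % 10 == 0 and n_copies == 10, beating e.g.
   i == 7 (exit_mod == 4, n_copies == 12), so best_unroll ends up as 9.  */
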
/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==> (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test
	 of the exit condition have a continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as
	 above if the loop tests the condition at the end of the loop
	 body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care
	 about the case when we would exit before reaching the loop.  So
	 just peel exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

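/* Continuing the example in the comment before this function (niter == 102,
   LOOP->LPT_DECISION.TIMES == 3, exit at the beginning): exit_mod is
   102 % 4 == 2, so two iterations are peeled off in front, the body is
   duplicated three more times inside the loop, and desc->niter becomes
   (102 - 2) / 4 == 25.  */
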
/* Decide whether to unroll LOOP iterating a runtime-computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with runtime "
	     "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}

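/* For example, if the size limits above yield nunroll == 7, the loop just
   before the decision settles on i == 4, so we unroll 4x
   (lpt_decision.times == 3); the AND-based modulo computation in
   unroll_loop_runtime_iterations relies on this being a power of two.  */
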
/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it did ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, int prob, rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label, -1);
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  add_int_reg_note (jump, REG_BR_PROB, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}

/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==> (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
	 body; i++;
       case 2:
	 body; i++;
       case 1:
	 body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
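  /* For instance, with max_unroll == 3 the body will exist in four copies,
     and niter & 3 equals niter % 4 even if the expression computing NITER
     wrapped around in DESC->mode.  */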
  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of loop body (almost always we must leave exit test
     here; the only exception is when we have extra zero check and the number
     of iterations is reliable).  Also record the place of the (possible)
     extra zero check.  */
  bitmap_clear (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    bitmap_set_bit (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
				      1, wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
					  block_label (preheader), p,
					  NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }
  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

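/* A worked example of the niter updates above, for LOOP->LPT_DECISION.TIMES
   == 3 with the exit at the beginning of the body: n % 4 iterations are
   peeled off through the preconditioning switch, the unrolled loop executes
   old_niter / 4 times with four copies of the body, and that quotient is
   exactly the niter_expr recorded above.  With the exit at the end, one
   more iteration was peeled, hence the extra MINUS const1_rtx.  */
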
/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}

/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==> (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We may indeed get here, since there can be nontrivial assumptions
	 under which a loop is really simple.  We could update the counts,
	 but the problem is that we are unable to decide which exit will
	 be taken (not really true in case the number of iterations is
	 constant, but no one will do anything with this information, so
	 we do not worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}

/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

static bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  ++*debug_uses;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}

/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we cannot perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using (initializing the
	 expansions with zero and summing them at the end of the
	 computation) yields wrong results for (x = something - x),
	 so avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}

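/* An illustrative case for the MINUS restriction above: for an insn
   computing x = c - x, the value alternates between c - x0 and x0 on
   successive iterations, so summing zero-initialized partial accumulators
   at the loop exit would not reproduce it.  */
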
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true, then iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}

/* Determines which of the insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}

/* Called just before loop duplication.  Records start of duplicated area
   to OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}

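/* E.g. with n_copies == 3: when unrolling, the original body (copy 0)
   holds the initialization and gets delta 0, while copies 1..3 get deltas
   1..3; when peeling, copy 1 gets delta 0, copies 2 and 3 get deltas 1
   and 2, and the original body, executed last, gets delta n_copies == 3.  */
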
/* Allocate the basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}

/* Replace the use of the induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}

/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}

/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}

/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}

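/* Concretely: under the default rounding mode, (-0.0) + (-0.0) == -0.0
   but (+0.0) + (-0.0) == +0.0, so partial sums started at +0.0 would turn
   an all-negative-zero reduction into +0.0 at the combining step.  */
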
/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}

/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body
   of the loop (as it should happen in complete unrolling, but not in
   ordinary peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_INSN_P (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_INSN_P (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find them as originals of the blocks
     in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}