2015-06-24 François Dumont <fdumont@gcc.gnu.org>
[official-gcc.git] / gcc / loop-unroll.c
blobe2e2f23f95ece435a6ad243904ff713692f2223b
1 /* Loop unrolling.
2 Copyright (C) 2002-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "alias.h"
26 #include "symtab.h"
27 #include "tree.h"
28 #include "hard-reg-set.h"
29 #include "obstack.h"
30 #include "profile.h"
31 #include "predict.h"
32 #include "function.h"
33 #include "dominance.h"
34 #include "cfg.h"
35 #include "cfgrtl.h"
36 #include "basic-block.h"
37 #include "cfgloop.h"
38 #include "params.h"
39 #include "insn-codes.h"
40 #include "optabs.h"
41 #include "flags.h"
42 #include "insn-config.h"
43 #include "expmed.h"
44 #include "dojump.h"
45 #include "explow.h"
46 #include "calls.h"
47 #include "emit-rtl.h"
48 #include "varasm.h"
49 #include "stmt.h"
50 #include "expr.h"
51 #include "recog.h"
52 #include "target.h"
53 #include "dumpfile.h"
55 /* This pass performs loop unrolling. We only perform this
56 optimization on innermost loops (with single exception) because
57 the impact on performance is greatest here, and we want to avoid
58 unnecessary code size growth. The gain is caused by greater sequentiality
59 of code, better code to optimize for further passes and in some cases
60 by fewer testings of exit conditions. The main problem is code growth,
61 that impacts performance negatively due to effect of caches.
63 What we do:
65 -- unrolling of loops that roll constant times; this is almost always
66 win, as we get rid of exit condition tests.
67 -- unrolling of loops that roll number of times that we can compute
68 in runtime; we also get rid of exit condition tests here, but there
69 is the extra expense for calculating the number of iterations
70 -- simple unrolling of remaining loops; this is performed only if we
71 are asked to, as the gain is questionable in this case and often
72 it may even slow down the code
73 For more detailed descriptions of each of those, see comments at
74 appropriate function below.
76 There is a lot of parameters (defined and described in params.def) that
77 control how much we unroll.
79 ??? A great problem is that we don't have a good way how to determine
80 how many times we should unroll the loop; the experiments I have made
81 showed that this choice may affect performance in order of several %.
84 /* Information about induction variables to split. */
86 struct iv_to_split
88 rtx_insn *insn; /* The insn in that the induction variable occurs. */
89 rtx orig_var; /* The variable (register) for the IV before split. */
90 rtx base_var; /* The variable on that the values in the further
91 iterations are based. */
92 rtx step; /* Step of the induction variable. */
93 struct iv_to_split *next; /* Next entry in walking order. */
96 /* Information about accumulators to expand. */
98 struct var_to_expand
100 rtx_insn *insn; /* The insn in that the variable expansion occurs. */
101 rtx reg; /* The accumulator which is expanded. */
102 vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
103 struct var_to_expand *next; /* Next entry in walking order. */
104 enum rtx_code op; /* The type of the accumulation - addition, subtraction
105 or multiplication. */
106 int expansion_count; /* Count the number of expansions generated so far. */
107 int reuse_expansion; /* The expansion we intend to reuse to expand
108 the accumulator. If REUSE_EXPANSION is 0 reuse
109 the original accumulator. Else use
110 var_expansions[REUSE_EXPANSION - 1]. */
113 /* Hashtable helper for iv_to_split. */
115 struct iv_split_hasher : typed_free_remove <iv_to_split>
117 typedef iv_to_split *value_type;
118 typedef iv_to_split *compare_type;
119 static inline hashval_t hash (const iv_to_split *);
120 static inline bool equal (const iv_to_split *, const iv_to_split *);
124 /* A hash function for information about insns to split. */
126 inline hashval_t
127 iv_split_hasher::hash (const iv_to_split *ivts)
129 return (hashval_t) INSN_UID (ivts->insn);
132 /* An equality functions for information about insns to split. */
134 inline bool
135 iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
137 return i1->insn == i2->insn;
140 /* Hashtable helper for iv_to_split. */
142 struct var_expand_hasher : typed_free_remove <var_to_expand>
144 typedef var_to_expand *value_type;
145 typedef var_to_expand *compare_type;
146 static inline hashval_t hash (const var_to_expand *);
147 static inline bool equal (const var_to_expand *, const var_to_expand *);
150 /* Return a hash for VES. */
152 inline hashval_t
153 var_expand_hasher::hash (const var_to_expand *ves)
155 return (hashval_t) INSN_UID (ves->insn);
158 /* Return true if I1 and I2 refer to the same instruction. */
160 inline bool
161 var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
163 return i1->insn == i2->insn;
166 /* Information about optimization applied in
167 the unrolled loop. */
169 struct opt_info
171 hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
172 split. */
173 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
174 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
175 hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
176 insns with accumulators to expand. */
177 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
178 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
179 unsigned first_new_block; /* The first basic block that was
180 duplicated. */
181 basic_block loop_exit; /* The loop exit basic block. */
182 basic_block loop_preheader; /* The loop preheader basic block. */
185 static void decide_unroll_stupid (struct loop *, int);
186 static void decide_unroll_constant_iterations (struct loop *, int);
187 static void decide_unroll_runtime_iterations (struct loop *, int);
188 static void unroll_loop_stupid (struct loop *);
189 static void decide_unrolling (int);
190 static void unroll_loop_constant_iterations (struct loop *);
191 static void unroll_loop_runtime_iterations (struct loop *);
192 static struct opt_info *analyze_insns_in_loop (struct loop *);
193 static void opt_info_start_duplication (struct opt_info *);
194 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
195 static void free_opt_info (struct opt_info *);
196 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
197 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
198 static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
199 static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
200 static void insert_var_expansion_initialization (struct var_to_expand *,
201 basic_block);
202 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
203 basic_block);
204 static rtx get_expansion (struct var_to_expand *);
206 /* Emit a message summarizing the unroll that will be
207 performed for LOOP, along with the loop's location LOCUS, if
208 appropriate given the dump or -fopt-info settings. */
210 static void
211 report_unroll (struct loop *loop, location_t locus)
213 int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
215 if (loop->lpt_decision.decision == LPT_NONE)
216 return;
218 if (!dump_enabled_p ())
219 return;
221 dump_printf_loc (report_flags, locus,
222 "loop unrolled %d times",
223 loop->lpt_decision.times);
224 if (profile_info)
225 dump_printf (report_flags,
226 " (header execution count %d)",
227 (int)loop->header->count);
229 dump_printf (report_flags, "\n");
232 /* Decide whether unroll loops and how much. */
233 static void
234 decide_unrolling (int flags)
236 struct loop *loop;
238 /* Scan the loops, inner ones first. */
239 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
241 loop->lpt_decision.decision = LPT_NONE;
242 location_t locus = get_loop_location (loop);
244 if (dump_enabled_p ())
245 dump_printf_loc (TDF_RTL, locus,
246 ";; *** Considering loop %d at BB %d for "
247 "unrolling ***\n",
248 loop->num, loop->header->index);
250 /* Do not peel cold areas. */
251 if (optimize_loop_for_size_p (loop))
253 if (dump_file)
254 fprintf (dump_file, ";; Not considering loop, cold area\n");
255 continue;
258 /* Can the loop be manipulated? */
259 if (!can_duplicate_loop_p (loop))
261 if (dump_file)
262 fprintf (dump_file,
263 ";; Not considering loop, cannot duplicate\n");
264 continue;
267 /* Skip non-innermost loops. */
268 if (loop->inner)
270 if (dump_file)
271 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
272 continue;
275 loop->ninsns = num_loop_insns (loop);
276 loop->av_ninsns = average_num_loop_insns (loop);
278 /* Try transformations one by one in decreasing order of
279 priority. */
281 decide_unroll_constant_iterations (loop, flags);
282 if (loop->lpt_decision.decision == LPT_NONE)
283 decide_unroll_runtime_iterations (loop, flags);
284 if (loop->lpt_decision.decision == LPT_NONE)
285 decide_unroll_stupid (loop, flags);
287 report_unroll (loop, locus);
291 /* Unroll LOOPS. */
292 void
293 unroll_loops (int flags)
295 struct loop *loop;
296 bool changed = false;
298 /* Now decide rest of unrolling. */
299 decide_unrolling (flags);
301 /* Scan the loops, inner ones first. */
302 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
304 /* And perform the appropriate transformations. */
305 switch (loop->lpt_decision.decision)
307 case LPT_UNROLL_CONSTANT:
308 unroll_loop_constant_iterations (loop);
309 changed = true;
310 break;
311 case LPT_UNROLL_RUNTIME:
312 unroll_loop_runtime_iterations (loop);
313 changed = true;
314 break;
315 case LPT_UNROLL_STUPID:
316 unroll_loop_stupid (loop);
317 changed = true;
318 break;
319 case LPT_NONE:
320 break;
321 default:
322 gcc_unreachable ();
326 if (changed)
328 calculate_dominance_info (CDI_DOMINATORS);
329 fix_loop_structure (NULL);
332 iv_analysis_done ();
335 /* Check whether exit of the LOOP is at the end of loop body. */
337 static bool
338 loop_exit_at_end_p (struct loop *loop)
340 struct niter_desc *desc = get_simple_loop_desc (loop);
341 rtx_insn *insn;
343 /* We should never have conditional in latch block. */
344 gcc_assert (desc->in_edge->dest != loop->header);
346 if (desc->in_edge->dest != loop->latch)
347 return false;
349 /* Check that the latch is empty. */
350 FOR_BB_INSNS (loop->latch, insn)
352 if (INSN_P (insn) && active_insn_p (insn))
353 return false;
356 return true;
359 /* Decide whether to unroll LOOP iterating constant number of times
360 and how much. */
362 static void
363 decide_unroll_constant_iterations (struct loop *loop, int flags)
365 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
366 struct niter_desc *desc;
367 widest_int iterations;
369 if (!(flags & UAP_UNROLL))
371 /* We were not asked to, just return back silently. */
372 return;
375 if (dump_file)
376 fprintf (dump_file,
377 "\n;; Considering unrolling loop with constant "
378 "number of iterations\n");
380 /* nunroll = total number of copies of the original loop body in
381 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
382 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
383 nunroll_by_av
384 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
385 if (nunroll > nunroll_by_av)
386 nunroll = nunroll_by_av;
387 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
388 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
390 if (targetm.loop_unroll_adjust)
391 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
393 /* Skip big loops. */
394 if (nunroll <= 1)
396 if (dump_file)
397 fprintf (dump_file, ";; Not considering loop, is too big\n");
398 return;
401 /* Check for simple loops. */
402 desc = get_simple_loop_desc (loop);
404 /* Check number of iterations. */
405 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
407 if (dump_file)
408 fprintf (dump_file,
409 ";; Unable to prove that the loop iterates constant times\n");
410 return;
413 /* Check whether the loop rolls enough to consider.
414 Consult also loop bounds and profile; in the case the loop has more
415 than one exit it may well loop less than determined maximal number
416 of iterations. */
417 if (desc->niter < 2 * nunroll
418 || ((get_estimated_loop_iterations (loop, &iterations)
419 || get_max_loop_iterations (loop, &iterations))
420 && wi::ltu_p (iterations, 2 * nunroll)))
422 if (dump_file)
423 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
424 return;
427 /* Success; now compute number of iterations to unroll. We alter
428 nunroll so that as few as possible copies of loop body are
429 necessary, while still not decreasing the number of unrollings
430 too much (at most by 1). */
431 best_copies = 2 * nunroll + 10;
433 i = 2 * nunroll + 2;
434 if (i - 1 >= desc->niter)
435 i = desc->niter - 2;
437 for (; i >= nunroll - 1; i--)
439 unsigned exit_mod = desc->niter % (i + 1);
441 if (!loop_exit_at_end_p (loop))
442 n_copies = exit_mod + i + 1;
443 else if (exit_mod != (unsigned) i
444 || desc->noloop_assumptions != NULL_RTX)
445 n_copies = exit_mod + i + 2;
446 else
447 n_copies = i + 1;
449 if (n_copies < best_copies)
451 best_copies = n_copies;
452 best_unroll = i;
456 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
457 loop->lpt_decision.times = best_unroll;
460 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
461 The transformation does this:
463 for (i = 0; i < 102; i++)
464 body;
466 ==> (LOOP->LPT_DECISION.TIMES == 3)
468 i = 0;
469 body; i++;
470 body; i++;
471 while (i < 102)
473 body; i++;
474 body; i++;
475 body; i++;
476 body; i++;
479 static void
480 unroll_loop_constant_iterations (struct loop *loop)
482 unsigned HOST_WIDE_INT niter;
483 unsigned exit_mod;
484 sbitmap wont_exit;
485 unsigned i;
486 edge e;
487 unsigned max_unroll = loop->lpt_decision.times;
488 struct niter_desc *desc = get_simple_loop_desc (loop);
489 bool exit_at_end = loop_exit_at_end_p (loop);
490 struct opt_info *opt_info = NULL;
491 bool ok;
493 niter = desc->niter;
495 /* Should not get here (such loop should be peeled instead). */
496 gcc_assert (niter > max_unroll + 1);
498 exit_mod = niter % (max_unroll + 1);
500 wont_exit = sbitmap_alloc (max_unroll + 1);
501 bitmap_ones (wont_exit);
503 auto_vec<edge> remove_edges;
504 if (flag_split_ivs_in_unroller
505 || flag_variable_expansion_in_unroller)
506 opt_info = analyze_insns_in_loop (loop);
508 if (!exit_at_end)
510 /* The exit is not at the end of the loop; leave exit test
511 in the first copy, so that the loops that start with test
512 of exit condition have continuous body after unrolling. */
514 if (dump_file)
515 fprintf (dump_file, ";; Condition at beginning of loop.\n");
517 /* Peel exit_mod iterations. */
518 bitmap_clear_bit (wont_exit, 0);
519 if (desc->noloop_assumptions)
520 bitmap_clear_bit (wont_exit, 1);
522 if (exit_mod)
524 opt_info_start_duplication (opt_info);
525 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
526 exit_mod,
527 wont_exit, desc->out_edge,
528 &remove_edges,
529 DLTHE_FLAG_UPDATE_FREQ
530 | (opt_info && exit_mod > 1
531 ? DLTHE_RECORD_COPY_NUMBER
532 : 0));
533 gcc_assert (ok);
535 if (opt_info && exit_mod > 1)
536 apply_opt_in_copies (opt_info, exit_mod, false, false);
538 desc->noloop_assumptions = NULL_RTX;
539 desc->niter -= exit_mod;
540 loop->nb_iterations_upper_bound -= exit_mod;
541 if (loop->any_estimate
542 && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
543 loop->nb_iterations_estimate -= exit_mod;
544 else
545 loop->any_estimate = false;
548 bitmap_set_bit (wont_exit, 1);
550 else
552 /* Leave exit test in last copy, for the same reason as above if
553 the loop tests the condition at the end of loop body. */
555 if (dump_file)
556 fprintf (dump_file, ";; Condition at end of loop.\n");
558 /* We know that niter >= max_unroll + 2; so we do not need to care of
559 case when we would exit before reaching the loop. So just peel
560 exit_mod + 1 iterations. */
561 if (exit_mod != max_unroll
562 || desc->noloop_assumptions)
564 bitmap_clear_bit (wont_exit, 0);
565 if (desc->noloop_assumptions)
566 bitmap_clear_bit (wont_exit, 1);
568 opt_info_start_duplication (opt_info);
569 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
570 exit_mod + 1,
571 wont_exit, desc->out_edge,
572 &remove_edges,
573 DLTHE_FLAG_UPDATE_FREQ
574 | (opt_info && exit_mod > 0
575 ? DLTHE_RECORD_COPY_NUMBER
576 : 0));
577 gcc_assert (ok);
579 if (opt_info && exit_mod > 0)
580 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
582 desc->niter -= exit_mod + 1;
583 loop->nb_iterations_upper_bound -= exit_mod + 1;
584 if (loop->any_estimate
585 && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
586 loop->nb_iterations_estimate -= exit_mod + 1;
587 else
588 loop->any_estimate = false;
589 desc->noloop_assumptions = NULL_RTX;
591 bitmap_set_bit (wont_exit, 0);
592 bitmap_set_bit (wont_exit, 1);
595 bitmap_clear_bit (wont_exit, max_unroll);
598 /* Now unroll the loop. */
600 opt_info_start_duplication (opt_info);
601 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
602 max_unroll,
603 wont_exit, desc->out_edge,
604 &remove_edges,
605 DLTHE_FLAG_UPDATE_FREQ
606 | (opt_info
607 ? DLTHE_RECORD_COPY_NUMBER
608 : 0));
609 gcc_assert (ok);
611 if (opt_info)
613 apply_opt_in_copies (opt_info, max_unroll, true, true);
614 free_opt_info (opt_info);
617 free (wont_exit);
619 if (exit_at_end)
621 basic_block exit_block = get_bb_copy (desc->in_edge->src);
622 /* Find a new in and out edge; they are in the last copy we have made. */
624 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
626 desc->out_edge = EDGE_SUCC (exit_block, 0);
627 desc->in_edge = EDGE_SUCC (exit_block, 1);
629 else
631 desc->out_edge = EDGE_SUCC (exit_block, 1);
632 desc->in_edge = EDGE_SUCC (exit_block, 0);
636 desc->niter /= max_unroll + 1;
637 loop->nb_iterations_upper_bound
638 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
639 if (loop->any_estimate)
640 loop->nb_iterations_estimate
641 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
642 desc->niter_expr = GEN_INT (desc->niter);
644 /* Remove the edges. */
645 FOR_EACH_VEC_ELT (remove_edges, i, e)
646 remove_path (e);
648 if (dump_file)
649 fprintf (dump_file,
650 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
651 max_unroll, num_loop_insns (loop));
654 /* Decide whether to unroll LOOP iterating runtime computable number of times
655 and how much. */
656 static void
657 decide_unroll_runtime_iterations (struct loop *loop, int flags)
659 unsigned nunroll, nunroll_by_av, i;
660 struct niter_desc *desc;
661 widest_int iterations;
663 if (!(flags & UAP_UNROLL))
665 /* We were not asked to, just return back silently. */
666 return;
669 if (dump_file)
670 fprintf (dump_file,
671 "\n;; Considering unrolling loop with runtime "
672 "computable number of iterations\n");
674 /* nunroll = total number of copies of the original loop body in
675 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
676 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
677 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
678 if (nunroll > nunroll_by_av)
679 nunroll = nunroll_by_av;
680 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
681 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
683 if (targetm.loop_unroll_adjust)
684 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
686 /* Skip big loops. */
687 if (nunroll <= 1)
689 if (dump_file)
690 fprintf (dump_file, ";; Not considering loop, is too big\n");
691 return;
694 /* Check for simple loops. */
695 desc = get_simple_loop_desc (loop);
697 /* Check simpleness. */
698 if (!desc->simple_p || desc->assumptions)
700 if (dump_file)
701 fprintf (dump_file,
702 ";; Unable to prove that the number of iterations "
703 "can be counted in runtime\n");
704 return;
707 if (desc->const_iter)
709 if (dump_file)
710 fprintf (dump_file, ";; Loop iterates constant times\n");
711 return;
714 /* Check whether the loop rolls. */
715 if ((get_estimated_loop_iterations (loop, &iterations)
716 || get_max_loop_iterations (loop, &iterations))
717 && wi::ltu_p (iterations, 2 * nunroll))
719 if (dump_file)
720 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
721 return;
724 /* Success; now force nunroll to be power of 2, as we are unable to
725 cope with overflows in computation of number of iterations. */
726 for (i = 1; 2 * i <= nunroll; i *= 2)
727 continue;
729 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
730 loop->lpt_decision.times = i - 1;
733 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
734 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
735 and NULL is returned instead. */
737 basic_block
738 split_edge_and_insert (edge e, rtx_insn *insns)
740 basic_block bb;
742 if (!insns)
743 return NULL;
744 bb = split_edge (e);
745 emit_insn_after (insns, BB_END (bb));
747 /* ??? We used to assume that INSNS can contain control flow insns, and
748 that we had to try to find sub basic blocks in BB to maintain a valid
749 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
750 and call break_superblocks when going out of cfglayout mode. But it
751 turns out that this never happens; and that if it does ever happen,
752 the verify_flow_info at the end of the RTL loop passes would fail.
754 There are two reasons why we expected we could have control flow insns
755 in INSNS. The first is when a comparison has to be done in parts, and
756 the second is when the number of iterations is computed for loops with
757 the number of iterations known at runtime. In both cases, test cases
758 to get control flow in INSNS appear to be impossible to construct:
760 * If do_compare_rtx_and_jump needs several branches to do comparison
761 in a mode that needs comparison by parts, we cannot analyze the
762 number of iterations of the loop, and we never get to unrolling it.
764 * The code in expand_divmod that was suspected to cause creation of
765 branching code seems to be only accessed for signed division. The
766 divisions used by # of iterations analysis are always unsigned.
767 Problems might arise on architectures that emits branching code
768 for some operations that may appear in the unroller (especially
769 for division), but we have no such architectures.
771 Considering all this, it was decided that we should for now assume
772 that INSNS can in theory contain control flow insns, but in practice
773 it never does. So we don't handle the theoretical case, and should
774 a real failure ever show up, we have a pretty good clue for how to
775 fix it. */
777 return bb;
780 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
781 true, with probability PROB. If CINSN is not NULL, it is the insn to copy
782 in order to create a jump. */
784 static rtx_insn *
785 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
786 rtx_code_label *label, int prob, rtx_insn *cinsn)
788 rtx_insn *seq;
789 rtx_jump_insn *jump;
790 rtx cond;
791 machine_mode mode;
793 mode = GET_MODE (op0);
794 if (mode == VOIDmode)
795 mode = GET_MODE (op1);
797 start_sequence ();
798 if (GET_MODE_CLASS (mode) == MODE_CC)
800 /* A hack -- there seems to be no easy generic way how to make a
801 conditional jump from a ccmode comparison. */
802 gcc_assert (cinsn);
803 cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
804 gcc_assert (GET_CODE (cond) == comp);
805 gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
806 gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
807 emit_jump_insn (copy_insn (PATTERN (cinsn)));
808 jump = as_a <rtx_jump_insn *> (get_last_insn ());
809 JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
810 LABEL_NUSES (JUMP_LABEL (jump))++;
811 redirect_jump (jump, label, 0);
813 else
815 gcc_assert (!cinsn);
817 op0 = force_operand (op0, NULL_RTX);
818 op1 = force_operand (op1, NULL_RTX);
819 do_compare_rtx_and_jump (op0, op1, comp, 0,
820 mode, NULL_RTX, NULL, label, -1);
821 jump = as_a <rtx_jump_insn *> (get_last_insn ());
822 jump->set_jump_target (label);
823 LABEL_NUSES (label)++;
825 add_int_reg_note (jump, REG_BR_PROB, prob);
827 seq = get_insns ();
828 end_sequence ();
830 return seq;
833 /* Unroll LOOP for which we are able to count number of iterations in runtime
834 LOOP->LPT_DECISION.TIMES times. The transformation does this (with some
835 extra care for case n < 0):
837 for (i = 0; i < n; i++)
838 body;
840 ==> (LOOP->LPT_DECISION.TIMES == 3)
842 i = 0;
843 mod = n % 4;
845 switch (mod)
847 case 3:
848 body; i++;
849 case 2:
850 body; i++;
851 case 1:
852 body; i++;
853 case 0: ;
856 while (i < n)
858 body; i++;
859 body; i++;
860 body; i++;
861 body; i++;
864 static void
865 unroll_loop_runtime_iterations (struct loop *loop)
867 rtx old_niter, niter, tmp;
868 rtx_insn *init_code, *branch_code;
869 unsigned i, j, p;
870 basic_block preheader, *body, swtch, ezc_swtch;
871 sbitmap wont_exit;
872 int may_exit_copy;
873 unsigned n_peel;
874 edge e;
875 bool extra_zero_check, last_may_exit;
876 unsigned max_unroll = loop->lpt_decision.times;
877 struct niter_desc *desc = get_simple_loop_desc (loop);
878 bool exit_at_end = loop_exit_at_end_p (loop);
879 struct opt_info *opt_info = NULL;
880 bool ok;
882 if (flag_split_ivs_in_unroller
883 || flag_variable_expansion_in_unroller)
884 opt_info = analyze_insns_in_loop (loop);
886 /* Remember blocks whose dominators will have to be updated. */
887 auto_vec<basic_block> dom_bbs;
889 body = get_loop_body (loop);
890 for (i = 0; i < loop->num_nodes; i++)
892 vec<basic_block> ldom;
893 basic_block bb;
895 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
896 FOR_EACH_VEC_ELT (ldom, j, bb)
897 if (!flow_bb_inside_loop_p (loop, bb))
898 dom_bbs.safe_push (bb);
900 ldom.release ();
902 free (body);
904 if (!exit_at_end)
906 /* Leave exit in first copy (for explanation why see comment in
907 unroll_loop_constant_iterations). */
908 may_exit_copy = 0;
909 n_peel = max_unroll - 1;
910 extra_zero_check = true;
911 last_may_exit = false;
913 else
915 /* Leave exit in last copy (for explanation why see comment in
916 unroll_loop_constant_iterations). */
917 may_exit_copy = max_unroll;
918 n_peel = max_unroll;
919 extra_zero_check = false;
920 last_may_exit = true;
923 /* Get expression for number of iterations. */
924 start_sequence ();
925 old_niter = niter = gen_reg_rtx (desc->mode);
926 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
927 if (tmp != niter)
928 emit_move_insn (niter, tmp);
930 /* Count modulo by ANDing it with max_unroll; we use the fact that
931 the number of unrollings is a power of two, and thus this is correct
932 even if there is overflow in the computation. */
933 niter = expand_simple_binop (desc->mode, AND,
934 niter, gen_int_mode (max_unroll, desc->mode),
935 NULL_RTX, 0, OPTAB_LIB_WIDEN);
937 init_code = get_insns ();
938 end_sequence ();
939 unshare_all_rtl_in_chain (init_code);
941 /* Precondition the loop. */
942 split_edge_and_insert (loop_preheader_edge (loop), init_code);
944 auto_vec<edge> remove_edges;
946 wont_exit = sbitmap_alloc (max_unroll + 2);
948 /* Peel the first copy of loop body (almost always we must leave exit test
949 here; the only exception is when we have extra zero check and the number
950 of iterations is reliable. Also record the place of (possible) extra
951 zero check. */
952 bitmap_clear (wont_exit);
953 if (extra_zero_check
954 && !desc->noloop_assumptions)
955 bitmap_set_bit (wont_exit, 1);
956 ezc_swtch = loop_preheader_edge (loop)->src;
957 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
958 1, wont_exit, desc->out_edge,
959 &remove_edges,
960 DLTHE_FLAG_UPDATE_FREQ);
961 gcc_assert (ok);
963 /* Record the place where switch will be built for preconditioning. */
964 swtch = split_edge (loop_preheader_edge (loop));
966 for (i = 0; i < n_peel; i++)
968 /* Peel the copy. */
969 bitmap_clear (wont_exit);
970 if (i != n_peel - 1 || !last_may_exit)
971 bitmap_set_bit (wont_exit, 1);
972 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
973 1, wont_exit, desc->out_edge,
974 &remove_edges,
975 DLTHE_FLAG_UPDATE_FREQ);
976 gcc_assert (ok);
978 /* Create item for switch. */
979 j = n_peel - i - (extra_zero_check ? 0 : 1);
980 p = REG_BR_PROB_BASE / (i + 2);
982 preheader = split_edge (loop_preheader_edge (loop));
983 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
984 block_label (preheader), p,
985 NULL);
987 /* We rely on the fact that the compare and jump cannot be optimized out,
988 and hence the cfg we create is correct. */
989 gcc_assert (branch_code != NULL_RTX);
991 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
992 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
993 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
994 e = make_edge (swtch, preheader,
995 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
996 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
997 e->probability = p;
1000 if (extra_zero_check)
1002 /* Add branch for zero iterations. */
1003 p = REG_BR_PROB_BASE / (max_unroll + 1);
1004 swtch = ezc_swtch;
1005 preheader = split_edge (loop_preheader_edge (loop));
1006 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1007 block_label (preheader), p,
1008 NULL);
1009 gcc_assert (branch_code != NULL_RTX);
1011 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1012 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1013 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1014 e = make_edge (swtch, preheader,
1015 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1016 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1017 e->probability = p;
1020 /* Recount dominators for outer blocks. */
1021 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1023 /* And unroll loop. */
1025 bitmap_ones (wont_exit);
1026 bitmap_clear_bit (wont_exit, may_exit_copy);
1027 opt_info_start_duplication (opt_info);
1029 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1030 max_unroll,
1031 wont_exit, desc->out_edge,
1032 &remove_edges,
1033 DLTHE_FLAG_UPDATE_FREQ
1034 | (opt_info
1035 ? DLTHE_RECORD_COPY_NUMBER
1036 : 0));
1037 gcc_assert (ok);
1039 if (opt_info)
1041 apply_opt_in_copies (opt_info, max_unroll, true, true);
1042 free_opt_info (opt_info);
1045 free (wont_exit);
1047 if (exit_at_end)
1049 basic_block exit_block = get_bb_copy (desc->in_edge->src);
1050 /* Find a new in and out edge; they are in the last copy we have
1051 made. */
1053 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1055 desc->out_edge = EDGE_SUCC (exit_block, 0);
1056 desc->in_edge = EDGE_SUCC (exit_block, 1);
1058 else
1060 desc->out_edge = EDGE_SUCC (exit_block, 1);
1061 desc->in_edge = EDGE_SUCC (exit_block, 0);
1065 /* Remove the edges. */
1066 FOR_EACH_VEC_ELT (remove_edges, i, e)
1067 remove_path (e);
1069 /* We must be careful when updating the number of iterations due to
1070 preconditioning and the fact that the value must be valid at entry
1071 of the loop. After passing through the above code, we see that
1072 the correct new number of iterations is this: */
1073 gcc_assert (!desc->const_iter);
1074 desc->niter_expr =
1075 simplify_gen_binary (UDIV, desc->mode, old_niter,
1076 gen_int_mode (max_unroll + 1, desc->mode));
1077 loop->nb_iterations_upper_bound
1078 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
1079 if (loop->any_estimate)
1080 loop->nb_iterations_estimate
1081 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
1082 if (exit_at_end)
1084 desc->niter_expr =
1085 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1086 desc->noloop_assumptions = NULL_RTX;
1087 --loop->nb_iterations_upper_bound;
1088 if (loop->any_estimate
1089 && loop->nb_iterations_estimate != 0)
1090 --loop->nb_iterations_estimate;
1091 else
1092 loop->any_estimate = false;
1095 if (dump_file)
1096 fprintf (dump_file,
1097 ";; Unrolled loop %d times, counting # of iterations "
1098 "in runtime, %i insns\n",
1099 max_unroll, num_loop_insns (loop));
1102 /* Decide whether to unroll LOOP stupidly and how much. */
1103 static void
1104 decide_unroll_stupid (struct loop *loop, int flags)
1106 unsigned nunroll, nunroll_by_av, i;
1107 struct niter_desc *desc;
1108 widest_int iterations;
1110 if (!(flags & UAP_UNROLL_ALL))
1112 /* We were not asked to, just return back silently. */
1113 return;
1116 if (dump_file)
1117 fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1119 /* nunroll = total number of copies of the original loop body in
1120 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1121 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1122 nunroll_by_av
1123 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1124 if (nunroll > nunroll_by_av)
1125 nunroll = nunroll_by_av;
1126 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1127 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1129 if (targetm.loop_unroll_adjust)
1130 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1132 /* Skip big loops. */
1133 if (nunroll <= 1)
1135 if (dump_file)
1136 fprintf (dump_file, ";; Not considering loop, is too big\n");
1137 return;
1140 /* Check for simple loops. */
1141 desc = get_simple_loop_desc (loop);
1143 /* Check simpleness. */
1144 if (desc->simple_p && !desc->assumptions)
1146 if (dump_file)
1147 fprintf (dump_file, ";; The loop is simple\n");
1148 return;
1151 /* Do not unroll loops with branches inside -- it increases number
1152 of mispredicts.
1153 TODO: this heuristic needs tunning; call inside the loop body
1154 is also relatively good reason to not unroll. */
1155 if (num_loop_branches (loop) > 1)
1157 if (dump_file)
1158 fprintf (dump_file, ";; Not unrolling, contains branches\n");
1159 return;
1162 /* Check whether the loop rolls. */
1163 if ((get_estimated_loop_iterations (loop, &iterations)
1164 || get_max_loop_iterations (loop, &iterations))
1165 && wi::ltu_p (iterations, 2 * nunroll))
1167 if (dump_file)
1168 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1169 return;
1172 /* Success. Now force nunroll to be power of 2, as it seems that this
1173 improves results (partially because of better alignments, partially
1174 because of some dark magic). */
1175 for (i = 1; 2 * i <= nunroll; i *= 2)
1176 continue;
1178 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1179 loop->lpt_decision.times = i - 1;
1182 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1184 while (cond)
1185 body;
1187 ==> (LOOP->LPT_DECISION.TIMES == 3)
1189 while (cond)
1191 body;
1192 if (!cond) break;
1193 body;
1194 if (!cond) break;
1195 body;
1196 if (!cond) break;
1197 body;
1200 static void
1201 unroll_loop_stupid (struct loop *loop)
1203 sbitmap wont_exit;
1204 unsigned nunroll = loop->lpt_decision.times;
1205 struct niter_desc *desc = get_simple_loop_desc (loop);
1206 struct opt_info *opt_info = NULL;
1207 bool ok;
1209 if (flag_split_ivs_in_unroller
1210 || flag_variable_expansion_in_unroller)
1211 opt_info = analyze_insns_in_loop (loop);
1214 wont_exit = sbitmap_alloc (nunroll + 1);
1215 bitmap_clear (wont_exit);
1216 opt_info_start_duplication (opt_info);
1218 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1219 nunroll, wont_exit,
1220 NULL, NULL,
1221 DLTHE_FLAG_UPDATE_FREQ
1222 | (opt_info
1223 ? DLTHE_RECORD_COPY_NUMBER
1224 : 0));
1225 gcc_assert (ok);
1227 if (opt_info)
1229 apply_opt_in_copies (opt_info, nunroll, true, true);
1230 free_opt_info (opt_info);
1233 free (wont_exit);
1235 if (desc->simple_p)
1237 /* We indeed may get here provided that there are nontrivial assumptions
1238 for a loop to be really simple. We could update the counts, but the
1239 problem is that we are unable to decide which exit will be taken
1240 (not really true in case the number of iterations is constant,
1241 but no one will do anything with this information, so we do not
1242 worry about it). */
1243 desc->simple_p = false;
1246 if (dump_file)
1247 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1248 nunroll, num_loop_insns (loop));
1251 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1252 Set *DEBUG_USES to the number of debug insns that reference the
1253 variable. */
1255 static bool
1256 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1257 int *debug_uses)
1259 basic_block *body, bb;
1260 unsigned i;
1261 int count_ref = 0;
1262 rtx_insn *insn;
1264 body = get_loop_body (loop);
1265 for (i = 0; i < loop->num_nodes; i++)
1267 bb = body[i];
1269 FOR_BB_INSNS (bb, insn)
1270 if (!rtx_referenced_p (reg, insn))
1271 continue;
1272 else if (DEBUG_INSN_P (insn))
1273 ++*debug_uses;
1274 else if (++count_ref > 1)
1275 break;
1277 free (body);
1278 return (count_ref == 1);
1281 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1283 static void
1284 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1286 basic_block *body, bb;
1287 unsigned i;
1288 rtx_insn *insn;
1290 body = get_loop_body (loop);
1291 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1293 bb = body[i];
1295 FOR_BB_INSNS (bb, insn)
1296 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1297 continue;
1298 else
1300 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1301 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1302 if (!--debug_uses)
1303 break;
1306 free (body);
1309 /* Determine whether INSN contains an accumulator
1310 which can be expanded into separate copies,
1311 one for each copy of the LOOP body.
1313 for (i = 0 ; i < n; i++)
1314 sum += a[i];
1318 sum += a[i]
1319 ....
1320 i = i+1;
1321 sum1 += a[i]
1322 ....
1323 i = i+1
1324 sum2 += a[i];
1325 ....
1327 Return NULL if INSN contains no opportunity for expansion of accumulator.
1328 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1329 information and return a pointer to it.
1332 static struct var_to_expand *
1333 analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
1335 rtx set, dest, src;
1336 struct var_to_expand *ves;
1337 unsigned accum_pos;
1338 enum rtx_code code;
1339 int debug_uses = 0;
1341 set = single_set (insn);
1342 if (!set)
1343 return NULL;
1345 dest = SET_DEST (set);
1346 src = SET_SRC (set);
1347 code = GET_CODE (src);
1349 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1350 return NULL;
1352 if (FLOAT_MODE_P (GET_MODE (dest)))
1354 if (!flag_associative_math)
1355 return NULL;
1356 /* In the case of FMA, we're also changing the rounding. */
1357 if (code == FMA && !flag_unsafe_math_optimizations)
1358 return NULL;
1361 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1362 in MD. But if there is no optab to generate the insn, we can not
1363 perform the variable expansion. This can happen if an MD provides
1364 an insn but not a named pattern to generate it, for example to avoid
1365 producing code that needs additional mode switches like for x87/mmx.
1367 So we check have_insn_for which looks for an optab for the operation
1368 in SRC. If it doesn't exist, we can't perform the expansion even
1369 though INSN is valid. */
1370 if (!have_insn_for (code, GET_MODE (src)))
1371 return NULL;
1373 if (!REG_P (dest)
1374 && !(GET_CODE (dest) == SUBREG
1375 && REG_P (SUBREG_REG (dest))))
1376 return NULL;
1378 /* Find the accumulator use within the operation. */
1379 if (code == FMA)
1381 /* We only support accumulation via FMA in the ADD position. */
1382 if (!rtx_equal_p (dest, XEXP (src, 2)))
1383 return NULL;
1384 accum_pos = 2;
1386 else if (rtx_equal_p (dest, XEXP (src, 0)))
1387 accum_pos = 0;
1388 else if (rtx_equal_p (dest, XEXP (src, 1)))
1390 /* The method of expansion that we are using; which includes the
1391 initialization of the expansions with zero and the summation of
1392 the expansions at the end of the computation will yield wrong
1393 results for (x = something - x) thus avoid using it in that case. */
1394 if (code == MINUS)
1395 return NULL;
1396 accum_pos = 1;
1398 else
1399 return NULL;
1401 /* It must not otherwise be used. */
1402 if (code == FMA)
1404 if (rtx_referenced_p (dest, XEXP (src, 0))
1405 || rtx_referenced_p (dest, XEXP (src, 1)))
1406 return NULL;
1408 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1409 return NULL;
1411 /* It must be used in exactly one insn. */
1412 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1413 return NULL;
1415 if (dump_file)
1417 fprintf (dump_file, "\n;; Expanding Accumulator ");
1418 print_rtl (dump_file, dest);
1419 fprintf (dump_file, "\n");
1422 if (debug_uses)
1423 /* Instead of resetting the debug insns, we could replace each
1424 debug use in the loop with the sum or product of all expanded
1425 accummulators. Since we'll only know of all expansions at the
1426 end, we'd have to keep track of which vars_to_expand a debug
1427 insn in the loop references, take note of each copy of the
1428 debug insn during unrolling, and when it's all done, compute
1429 the sum or product of each variable and adjust the original
1430 debug insn and each copy thereof. What a pain! */
1431 reset_debug_uses_in_loop (loop, dest, debug_uses);
1433 /* Record the accumulator to expand. */
1434 ves = XNEW (struct var_to_expand);
1435 ves->insn = insn;
1436 ves->reg = copy_rtx (dest);
1437 ves->var_expansions.create (1);
1438 ves->next = NULL;
1439 ves->op = GET_CODE (src);
1440 ves->expansion_count = 0;
1441 ves->reuse_expansion = 0;
1442 return ves;
1445 /* Determine whether there is an induction variable in INSN that
1446 we would like to split during unrolling.
1448 I.e. replace
1450 i = i + 1;
1452 i = i + 1;
1454 i = i + 1;
1457 type chains by
1459 i0 = i + 1
1461 i = i0 + 1
1463 i = i0 + 2
1466 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1467 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1468 pointer to it. */
1470 static struct iv_to_split *
1471 analyze_iv_to_split_insn (rtx_insn *insn)
1473 rtx set, dest;
1474 struct rtx_iv iv;
1475 struct iv_to_split *ivts;
1476 bool ok;
1478 /* For now we just split the basic induction variables. Later this may be
1479 extended for example by selecting also addresses of memory references. */
1480 set = single_set (insn);
1481 if (!set)
1482 return NULL;
1484 dest = SET_DEST (set);
1485 if (!REG_P (dest))
1486 return NULL;
1488 if (!biv_p (insn, dest))
1489 return NULL;
1491 ok = iv_analyze_result (insn, dest, &iv);
1493 /* This used to be an assert under the assumption that if biv_p returns
1494 true that iv_analyze_result must also return true. However, that
1495 assumption is not strictly correct as evidenced by pr25569.
1497 Returning NULL when iv_analyze_result returns false is safe and
1498 avoids the problems in pr25569 until the iv_analyze_* routines
1499 can be fixed, which is apparently hard and time consuming
1500 according to their author. */
1501 if (! ok)
1502 return NULL;
1504 if (iv.step == const0_rtx
1505 || iv.mode != iv.extend_mode)
1506 return NULL;
1508 /* Record the insn to split. */
1509 ivts = XNEW (struct iv_to_split);
1510 ivts->insn = insn;
1511 ivts->orig_var = dest;
1512 ivts->base_var = NULL_RTX;
1513 ivts->step = iv.step;
1514 ivts->next = NULL;
1516 return ivts;
1519 /* Determines which of insns in LOOP can be optimized.
1520 Return a OPT_INFO struct with the relevant hash tables filled
1521 with all insns to be optimized. The FIRST_NEW_BLOCK field
1522 is undefined for the return value. */
1524 static struct opt_info *
1525 analyze_insns_in_loop (struct loop *loop)
1527 basic_block *body, bb;
1528 unsigned i;
1529 struct opt_info *opt_info = XCNEW (struct opt_info);
1530 rtx_insn *insn;
1531 struct iv_to_split *ivts = NULL;
1532 struct var_to_expand *ves = NULL;
1533 iv_to_split **slot1;
1534 var_to_expand **slot2;
1535 vec<edge> edges = get_loop_exit_edges (loop);
1536 edge exit;
1537 bool can_apply = false;
1539 iv_analysis_loop_init (loop);
1541 body = get_loop_body (loop);
1543 if (flag_split_ivs_in_unroller)
1545 opt_info->insns_to_split
1546 = new hash_table<iv_split_hasher> (5 * loop->num_nodes);
1547 opt_info->iv_to_split_head = NULL;
1548 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1551 /* Record the loop exit bb and loop preheader before the unrolling. */
1552 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1554 if (edges.length () == 1)
1556 exit = edges[0];
1557 if (!(exit->flags & EDGE_COMPLEX))
1559 opt_info->loop_exit = split_edge (exit);
1560 can_apply = true;
1564 if (flag_variable_expansion_in_unroller
1565 && can_apply)
1567 opt_info->insns_with_var_to_expand
1568 = new hash_table<var_expand_hasher> (5 * loop->num_nodes);
1569 opt_info->var_to_expand_head = NULL;
1570 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1573 for (i = 0; i < loop->num_nodes; i++)
1575 bb = body[i];
1576 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1577 continue;
1579 FOR_BB_INSNS (bb, insn)
1581 if (!INSN_P (insn))
1582 continue;
1584 if (opt_info->insns_to_split)
1585 ivts = analyze_iv_to_split_insn (insn);
1587 if (ivts)
1589 slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
1590 gcc_assert (*slot1 == NULL);
1591 *slot1 = ivts;
1592 *opt_info->iv_to_split_tail = ivts;
1593 opt_info->iv_to_split_tail = &ivts->next;
1594 continue;
1597 if (opt_info->insns_with_var_to_expand)
1598 ves = analyze_insn_to_expand_var (loop, insn);
1600 if (ves)
1602 slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
1603 gcc_assert (*slot2 == NULL);
1604 *slot2 = ves;
1605 *opt_info->var_to_expand_tail = ves;
1606 opt_info->var_to_expand_tail = &ves->next;
1611 edges.release ();
1612 free (body);
1613 return opt_info;
1616 /* Called just before loop duplication. Records start of duplicated area
1617 to OPT_INFO. */
1619 static void
1620 opt_info_start_duplication (struct opt_info *opt_info)
1622 if (opt_info)
1623 opt_info->first_new_block = last_basic_block_for_fn (cfun);
1626 /* Determine the number of iterations between initialization of the base
1627 variable and the current copy (N_COPY). N_COPIES is the total number
1628 of newly created copies. UNROLLING is true if we are unrolling
1629 (not peeling) the loop. */
1631 static unsigned
1632 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1634 if (unrolling)
1636 /* If we are unrolling, initialization is done in the original loop
1637 body (number 0). */
1638 return n_copy;
1640 else
1642 /* If we are peeling, the copy in that the initialization occurs has
1643 number 1. The original loop (number 0) is the last. */
1644 if (n_copy)
1645 return n_copy - 1;
1646 else
1647 return n_copies;
1651 /* Allocate basic variable for the induction variable chain. */
1653 static void
1654 allocate_basic_variable (struct iv_to_split *ivts)
1656 rtx expr = SET_SRC (single_set (ivts->insn));
1658 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1661 /* Insert initialization of basic variable of IVTS before INSN, taking
1662 the initial value from INSN. */
1664 static void
1665 insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
1667 rtx expr = copy_rtx (SET_SRC (single_set (insn)));
1668 rtx_insn *seq;
1670 start_sequence ();
1671 expr = force_operand (expr, ivts->base_var);
1672 if (expr != ivts->base_var)
1673 emit_move_insn (ivts->base_var, expr);
1674 seq = get_insns ();
1675 end_sequence ();
1677 emit_insn_before (seq, insn);
1680 /* Replace the use of induction variable described in IVTS in INSN
1681 by base variable + DELTA * step. */
1683 static void
1684 split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
1686 rtx expr, *loc, incr, var;
1687 rtx_insn *seq;
1688 machine_mode mode = GET_MODE (ivts->base_var);
1689 rtx src, dest, set;
1691 /* Construct base + DELTA * step. */
1692 if (!delta)
1693 expr = ivts->base_var;
1694 else
1696 incr = simplify_gen_binary (MULT, mode,
1697 ivts->step, gen_int_mode (delta, mode));
1698 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
1699 ivts->base_var, incr);
1702 /* Figure out where to do the replacement. */
1703 loc = &SET_SRC (single_set (insn));
1705 /* If we can make the replacement right away, we're done. */
1706 if (validate_change (insn, loc, expr, 0))
1707 return;
1709 /* Otherwise, force EXPR into a register and try again. */
1710 start_sequence ();
1711 var = gen_reg_rtx (mode);
1712 expr = force_operand (expr, var);
1713 if (expr != var)
1714 emit_move_insn (var, expr);
1715 seq = get_insns ();
1716 end_sequence ();
1717 emit_insn_before (seq, insn);
1719 if (validate_change (insn, loc, var, 0))
1720 return;
1722 /* The last chance. Try recreating the assignment in insn
1723 completely from scratch. */
1724 set = single_set (insn);
1725 gcc_assert (set);
1727 start_sequence ();
1728 *loc = var;
1729 src = copy_rtx (SET_SRC (set));
1730 dest = copy_rtx (SET_DEST (set));
1731 src = force_operand (src, dest);
1732 if (src != dest)
1733 emit_move_insn (dest, src);
1734 seq = get_insns ();
1735 end_sequence ();
1737 emit_insn_before (seq, insn);
1738 delete_insn (insn);
1742 /* Return one expansion of the accumulator recorded in struct VE. */
1744 static rtx
1745 get_expansion (struct var_to_expand *ve)
1747 rtx reg;
1749 if (ve->reuse_expansion == 0)
1750 reg = ve->reg;
1751 else
1752 reg = ve->var_expansions[ve->reuse_expansion - 1];
1754 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
1755 ve->reuse_expansion = 0;
1756 else
1757 ve->reuse_expansion++;
1759 return reg;
1763 /* Given INSN replace the uses of the accumulator recorded in VE
1764 with a new register. */
1766 static void
1767 expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
1769 rtx new_reg, set;
1770 bool really_new_expansion = false;
1772 set = single_set (insn);
1773 gcc_assert (set);
1775 /* Generate a new register only if the expansion limit has not been
1776 reached. Else reuse an already existing expansion. */
1777 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
1779 really_new_expansion = true;
1780 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
1782 else
1783 new_reg = get_expansion (ve);
1785 validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
1786 if (apply_change_group ())
1787 if (really_new_expansion)
1789 ve->var_expansions.safe_push (new_reg);
1790 ve->expansion_count++;
1794 /* Initialize the variable expansions in loop preheader. PLACE is the
1795 loop-preheader basic block where the initialization of the
1796 expansions should take place. The expansions are initialized with
1797 (-0) when the operation is plus or minus to honor sign zero. This
1798 way we can prevent cases where the sign of the final result is
1799 effected by the sign of the expansion. Here is an example to
1800 demonstrate this:
1802 for (i = 0 ; i < n; i++)
1803 sum += something;
1807 sum += something
1808 ....
1809 i = i+1;
1810 sum1 += something
1811 ....
1812 i = i+1
1813 sum2 += something;
1814 ....
1816 When SUM is initialized with -zero and SOMETHING is also -zero; the
1817 final result of sum should be -zero thus the expansions sum1 and sum2
1818 should be initialized with -zero as well (otherwise we will get +zero
1819 as the final result). */
1821 static void
1822 insert_var_expansion_initialization (struct var_to_expand *ve,
1823 basic_block place)
1825 rtx_insn *seq;
1826 rtx var, zero_init;
1827 unsigned i;
1828 machine_mode mode = GET_MODE (ve->reg);
1829 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
1831 if (ve->var_expansions.length () == 0)
1832 return;
1834 start_sequence ();
1835 switch (ve->op)
1837 case FMA:
1838 /* Note that we only accumulate FMA via the ADD operand. */
1839 case PLUS:
1840 case MINUS:
1841 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1843 if (honor_signed_zero_p)
1844 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
1845 else
1846 zero_init = CONST0_RTX (mode);
1847 emit_move_insn (var, zero_init);
1849 break;
1851 case MULT:
1852 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1854 zero_init = CONST1_RTX (GET_MODE (var));
1855 emit_move_insn (var, zero_init);
1857 break;
1859 default:
1860 gcc_unreachable ();
1863 seq = get_insns ();
1864 end_sequence ();
1866 emit_insn_after (seq, BB_END (place));
1869 /* Combine the variable expansions at the loop exit. PLACE is the
1870 loop exit basic block where the summation of the expansions should
1871 take place. */
1873 static void
1874 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
1876 rtx sum = ve->reg;
1877 rtx expr, var;
1878 rtx_insn *seq, *insn;
1879 unsigned i;
1881 if (ve->var_expansions.length () == 0)
1882 return;
1884 start_sequence ();
1885 switch (ve->op)
1887 case FMA:
1888 /* Note that we only accumulate FMA via the ADD operand. */
1889 case PLUS:
1890 case MINUS:
1891 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1892 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
1893 break;
1895 case MULT:
1896 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1897 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
1898 break;
1900 default:
1901 gcc_unreachable ();
1904 expr = force_operand (sum, ve->reg);
1905 if (expr != ve->reg)
1906 emit_move_insn (ve->reg, expr);
1907 seq = get_insns ();
1908 end_sequence ();
1910 insn = BB_HEAD (place);
1911 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
1912 insn = NEXT_INSN (insn);
1914 emit_insn_after (seq, insn);
1917 /* Strip away REG_EQUAL notes for IVs we're splitting.
1919 Updating REG_EQUAL notes for IVs we split is tricky: We
1920 cannot tell until after unrolling, DF-rescanning, and liveness
1921 updating, whether an EQ_USE is reached by the split IV while
1922 the IV reg is still live. See PR55006.
1924 ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
1925 because RTL loop-iv requires us to defer rescanning insns and
1926 any notes attached to them. So resort to old techniques... */
1928 static void
1929 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
1931 struct iv_to_split *ivts;
1932 rtx note = find_reg_equal_equiv_note (insn);
1933 if (! note)
1934 return;
1935 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1936 if (reg_mentioned_p (ivts->orig_var, note))
1938 remove_note (insn, note);
1939 return;
1943 /* Apply loop optimizations in loop copies using the
1944 data which gathered during the unrolling. Structure
1945 OPT_INFO record that data.
1947 UNROLLING is true if we unrolled (not peeled) the loop.
1948 REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
1949 the loop (as it should happen in complete unrolling, but not in ordinary
1950 peeling of the loop). */
1952 static void
1953 apply_opt_in_copies (struct opt_info *opt_info,
1954 unsigned n_copies, bool unrolling,
1955 bool rewrite_original_loop)
1957 unsigned i, delta;
1958 basic_block bb, orig_bb;
1959 rtx_insn *insn, *orig_insn, *next;
1960 struct iv_to_split ivts_templ, *ivts;
1961 struct var_to_expand ve_templ, *ves;
1963 /* Sanity check -- we need to put initialization in the original loop
1964 body. */
1965 gcc_assert (!unrolling || rewrite_original_loop);
1967 /* Allocate the basic variables (i0). */
1968 if (opt_info->insns_to_split)
1969 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1970 allocate_basic_variable (ivts);
1972 for (i = opt_info->first_new_block;
1973 i < (unsigned) last_basic_block_for_fn (cfun);
1974 i++)
1976 bb = BASIC_BLOCK_FOR_FN (cfun, i);
1977 orig_bb = get_bb_original (bb);
1979 /* bb->aux holds position in copy sequence initialized by
1980 duplicate_loop_to_header_edge. */
1981 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
1982 unrolling);
1983 bb->aux = 0;
1984 orig_insn = BB_HEAD (orig_bb);
1985 FOR_BB_INSNS_SAFE (bb, insn, next)
1987 if (!INSN_P (insn)
1988 || (DEBUG_INSN_P (insn)
1989 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
1990 continue;
1992 while (!INSN_P (orig_insn)
1993 || (DEBUG_INSN_P (orig_insn)
1994 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
1995 == LABEL_DECL)))
1996 orig_insn = NEXT_INSN (orig_insn);
1998 ivts_templ.insn = orig_insn;
1999 ve_templ.insn = orig_insn;
2001 /* Apply splitting iv optimization. */
2002 if (opt_info->insns_to_split)
2004 maybe_strip_eq_note_for_split_iv (opt_info, insn);
2006 ivts = opt_info->insns_to_split->find (&ivts_templ);
2008 if (ivts)
2010 gcc_assert (GET_CODE (PATTERN (insn))
2011 == GET_CODE (PATTERN (orig_insn)));
2013 if (!delta)
2014 insert_base_initialization (ivts, insn);
2015 split_iv (ivts, insn, delta);
2018 /* Apply variable expansion optimization. */
2019 if (unrolling && opt_info->insns_with_var_to_expand)
2021 ves = (struct var_to_expand *)
2022 opt_info->insns_with_var_to_expand->find (&ve_templ);
2023 if (ves)
2025 gcc_assert (GET_CODE (PATTERN (insn))
2026 == GET_CODE (PATTERN (orig_insn)));
2027 expand_var_during_unrolling (ves, insn);
2030 orig_insn = NEXT_INSN (orig_insn);
2034 if (!rewrite_original_loop)
2035 return;
2037 /* Initialize the variable expansions in the loop preheader
2038 and take care of combining them at the loop exit. */
2039 if (opt_info->insns_with_var_to_expand)
2041 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2042 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2043 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2044 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2047 /* Rewrite also the original loop body. Find them as originals of the blocks
2048 in the last copied iteration, i.e. those that have
2049 get_bb_copy (get_bb_original (bb)) == bb. */
2050 for (i = opt_info->first_new_block;
2051 i < (unsigned) last_basic_block_for_fn (cfun);
2052 i++)
2054 bb = BASIC_BLOCK_FOR_FN (cfun, i);
2055 orig_bb = get_bb_original (bb);
2056 if (get_bb_copy (orig_bb) != bb)
2057 continue;
2059 delta = determine_split_iv_delta (0, n_copies, unrolling);
2060 for (orig_insn = BB_HEAD (orig_bb);
2061 orig_insn != NEXT_INSN (BB_END (bb));
2062 orig_insn = next)
2064 next = NEXT_INSN (orig_insn);
2066 if (!INSN_P (orig_insn))
2067 continue;
2069 ivts_templ.insn = orig_insn;
2070 if (opt_info->insns_to_split)
2072 maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2074 ivts = (struct iv_to_split *)
2075 opt_info->insns_to_split->find (&ivts_templ);
2076 if (ivts)
2078 if (!delta)
2079 insert_base_initialization (ivts, orig_insn);
2080 split_iv (ivts, orig_insn, delta);
2081 continue;
2089 /* Release OPT_INFO. */
2091 static void
2092 free_opt_info (struct opt_info *opt_info)
2094 delete opt_info->insns_to_split;
2095 opt_info->insns_to_split = NULL;
2096 if (opt_info->insns_with_var_to_expand)
2098 struct var_to_expand *ves;
2100 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2101 ves->var_expansions.release ();
2102 delete opt_info->insns_with_var_to_expand;
2103 opt_info->insns_with_var_to_expand = NULL;
2105 free (opt_info);