/* gcc/loop-unroll.c -- RTL loop unrolling pass.  */
/* Loop unrolling.
   Copyright (C) 2002-2014 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "tree.h"
#include "hard-reg-set.h"
#include "obstack.h"
#include "profile.h"
#include "predict.h"
#include "vec.h"
#include "hashtab.h"
#include "hash-set.h"
#include "machmode.h"
#include "input.h"
#include "function.h"
#include "dominance.h"
#include "cfg.h"
#include "cfgrtl.h"
#include "basic-block.h"
#include "cfgloop.h"
#include "params.h"
#include "expr.h"
#include "hash-table.h"
#include "recog.h"
#include "target.h"
#include "dumpfile.h"
/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain is caused by greater sequentiality
   of code, better code to optimize for further passes and in some cases
   by fewer testings of exit conditions.  The main problem is code growth,
   that impacts performance negatively due to effect of caches.

   What we do:

   -- unrolling of loops that roll constant times; this is almost always
      win, as we get rid of exit condition tests.
   -- unrolling of loops that roll number of times that we can compute
      in runtime; we also get rid of exit condition tests here, but there
      is the extra expense for calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see comments at
   appropriate function below.

   There is a lot of parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A great problem is that we don't have a good way how to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance in order of several %.  */
77 /* Information about induction variables to split. */
79 struct iv_to_split
81 rtx_insn *insn; /* The insn in that the induction variable occurs. */
82 rtx orig_var; /* The variable (register) for the IV before split. */
83 rtx base_var; /* The variable on that the values in the further
84 iterations are based. */
85 rtx step; /* Step of the induction variable. */
86 struct iv_to_split *next; /* Next entry in walking order. */
89 /* Information about accumulators to expand. */
91 struct var_to_expand
93 rtx_insn *insn; /* The insn in that the variable expansion occurs. */
94 rtx reg; /* The accumulator which is expanded. */
95 vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
96 struct var_to_expand *next; /* Next entry in walking order. */
97 enum rtx_code op; /* The type of the accumulation - addition, subtraction
98 or multiplication. */
99 int expansion_count; /* Count the number of expansions generated so far. */
100 int reuse_expansion; /* The expansion we intend to reuse to expand
101 the accumulator. If REUSE_EXPANSION is 0 reuse
102 the original accumulator. Else use
103 var_expansions[REUSE_EXPANSION - 1]. */
106 /* Hashtable helper for iv_to_split. */
108 struct iv_split_hasher : typed_free_remove <iv_to_split>
110 typedef iv_to_split value_type;
111 typedef iv_to_split compare_type;
112 static inline hashval_t hash (const value_type *);
113 static inline bool equal (const value_type *, const compare_type *);
117 /* A hash function for information about insns to split. */
119 inline hashval_t
120 iv_split_hasher::hash (const value_type *ivts)
122 return (hashval_t) INSN_UID (ivts->insn);
125 /* An equality functions for information about insns to split. */
127 inline bool
128 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
130 return i1->insn == i2->insn;
133 /* Hashtable helper for iv_to_split. */
135 struct var_expand_hasher : typed_free_remove <var_to_expand>
137 typedef var_to_expand value_type;
138 typedef var_to_expand compare_type;
139 static inline hashval_t hash (const value_type *);
140 static inline bool equal (const value_type *, const compare_type *);
143 /* Return a hash for VES. */
145 inline hashval_t
146 var_expand_hasher::hash (const value_type *ves)
148 return (hashval_t) INSN_UID (ves->insn);
151 /* Return true if I1 and I2 refer to the same instruction. */
153 inline bool
154 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
156 return i1->insn == i2->insn;
159 /* Information about optimization applied in
160 the unrolled loop. */
162 struct opt_info
164 hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
165 split. */
166 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
167 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
168 hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
169 insns with accumulators to expand. */
170 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
171 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
172 unsigned first_new_block; /* The first basic block that was
173 duplicated. */
174 basic_block loop_exit; /* The loop exit basic block. */
175 basic_block loop_preheader; /* The loop preheader basic block. */
178 static void decide_unroll_stupid (struct loop *, int);
179 static void decide_unroll_constant_iterations (struct loop *, int);
180 static void decide_unroll_runtime_iterations (struct loop *, int);
181 static void unroll_loop_stupid (struct loop *);
182 static void decide_unrolling (int);
183 static void unroll_loop_constant_iterations (struct loop *);
184 static void unroll_loop_runtime_iterations (struct loop *);
185 static struct opt_info *analyze_insns_in_loop (struct loop *);
186 static void opt_info_start_duplication (struct opt_info *);
187 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
188 static void free_opt_info (struct opt_info *);
189 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
190 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
191 static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
192 static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
193 static void insert_var_expansion_initialization (struct var_to_expand *,
194 basic_block);
195 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
196 basic_block);
197 static rtx get_expansion (struct var_to_expand *);
199 /* Emit a message summarizing the unroll that will be
200 performed for LOOP, along with the loop's location LOCUS, if
201 appropriate given the dump or -fopt-info settings. */
203 static void
204 report_unroll (struct loop *loop, location_t locus)
206 int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
208 if (loop->lpt_decision.decision == LPT_NONE)
209 return;
211 if (!dump_enabled_p ())
212 return;
214 dump_printf_loc (report_flags, locus,
215 "loop unrolled %d times",
216 loop->lpt_decision.times);
217 if (profile_info)
218 dump_printf (report_flags,
219 " (header execution count %d)",
220 (int)loop->header->count);
222 dump_printf (report_flags, "\n");
225 /* Decide whether unroll loops and how much. */
226 static void
227 decide_unrolling (int flags)
229 struct loop *loop;
231 /* Scan the loops, inner ones first. */
232 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
234 loop->lpt_decision.decision = LPT_NONE;
235 location_t locus = get_loop_location (loop);
237 if (dump_enabled_p ())
238 dump_printf_loc (TDF_RTL, locus,
239 ";; *** Considering loop %d at BB %d for "
240 "unrolling ***\n",
241 loop->num, loop->header->index);
243 /* Do not peel cold areas. */
244 if (optimize_loop_for_size_p (loop))
246 if (dump_file)
247 fprintf (dump_file, ";; Not considering loop, cold area\n");
248 continue;
251 /* Can the loop be manipulated? */
252 if (!can_duplicate_loop_p (loop))
254 if (dump_file)
255 fprintf (dump_file,
256 ";; Not considering loop, cannot duplicate\n");
257 continue;
260 /* Skip non-innermost loops. */
261 if (loop->inner)
263 if (dump_file)
264 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
265 continue;
268 loop->ninsns = num_loop_insns (loop);
269 loop->av_ninsns = average_num_loop_insns (loop);
271 /* Try transformations one by one in decreasing order of
272 priority. */
274 decide_unroll_constant_iterations (loop, flags);
275 if (loop->lpt_decision.decision == LPT_NONE)
276 decide_unroll_runtime_iterations (loop, flags);
277 if (loop->lpt_decision.decision == LPT_NONE)
278 decide_unroll_stupid (loop, flags);
280 report_unroll (loop, locus);
284 /* Unroll LOOPS. */
285 void
286 unroll_loops (int flags)
288 struct loop *loop;
289 bool changed = false;
291 /* Now decide rest of unrolling. */
292 decide_unrolling (flags);
294 /* Scan the loops, inner ones first. */
295 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
297 /* And perform the appropriate transformations. */
298 switch (loop->lpt_decision.decision)
300 case LPT_UNROLL_CONSTANT:
301 unroll_loop_constant_iterations (loop);
302 changed = true;
303 break;
304 case LPT_UNROLL_RUNTIME:
305 unroll_loop_runtime_iterations (loop);
306 changed = true;
307 break;
308 case LPT_UNROLL_STUPID:
309 unroll_loop_stupid (loop);
310 changed = true;
311 break;
312 case LPT_NONE:
313 break;
314 default:
315 gcc_unreachable ();
319 if (changed)
321 calculate_dominance_info (CDI_DOMINATORS);
322 fix_loop_structure (NULL);
325 iv_analysis_done ();
328 /* Check whether exit of the LOOP is at the end of loop body. */
330 static bool
331 loop_exit_at_end_p (struct loop *loop)
333 struct niter_desc *desc = get_simple_loop_desc (loop);
334 rtx_insn *insn;
336 /* We should never have conditional in latch block. */
337 gcc_assert (desc->in_edge->dest != loop->header);
339 if (desc->in_edge->dest != loop->latch)
340 return false;
342 /* Check that the latch is empty. */
343 FOR_BB_INSNS (loop->latch, insn)
345 if (INSN_P (insn) && active_insn_p (insn))
346 return false;
349 return true;
352 /* Decide whether to unroll LOOP iterating constant number of times
353 and how much. */
355 static void
356 decide_unroll_constant_iterations (struct loop *loop, int flags)
358 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
359 struct niter_desc *desc;
360 widest_int iterations;
362 if (!(flags & UAP_UNROLL))
364 /* We were not asked to, just return back silently. */
365 return;
368 if (dump_file)
369 fprintf (dump_file,
370 "\n;; Considering unrolling loop with constant "
371 "number of iterations\n");
373 /* nunroll = total number of copies of the original loop body in
374 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
375 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
376 nunroll_by_av
377 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
378 if (nunroll > nunroll_by_av)
379 nunroll = nunroll_by_av;
380 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
381 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
383 if (targetm.loop_unroll_adjust)
384 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
386 /* Skip big loops. */
387 if (nunroll <= 1)
389 if (dump_file)
390 fprintf (dump_file, ";; Not considering loop, is too big\n");
391 return;
394 /* Check for simple loops. */
395 desc = get_simple_loop_desc (loop);
397 /* Check number of iterations. */
398 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
400 if (dump_file)
401 fprintf (dump_file,
402 ";; Unable to prove that the loop iterates constant times\n");
403 return;
406 /* Check whether the loop rolls enough to consider.
407 Consult also loop bounds and profile; in the case the loop has more
408 than one exit it may well loop less than determined maximal number
409 of iterations. */
410 if (desc->niter < 2 * nunroll
411 || ((get_estimated_loop_iterations (loop, &iterations)
412 || get_max_loop_iterations (loop, &iterations))
413 && wi::ltu_p (iterations, 2 * nunroll)))
415 if (dump_file)
416 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
417 return;
420 /* Success; now compute number of iterations to unroll. We alter
421 nunroll so that as few as possible copies of loop body are
422 necessary, while still not decreasing the number of unrollings
423 too much (at most by 1). */
424 best_copies = 2 * nunroll + 10;
426 i = 2 * nunroll + 2;
427 if (i - 1 >= desc->niter)
428 i = desc->niter - 2;
430 for (; i >= nunroll - 1; i--)
432 unsigned exit_mod = desc->niter % (i + 1);
434 if (!loop_exit_at_end_p (loop))
435 n_copies = exit_mod + i + 1;
436 else if (exit_mod != (unsigned) i
437 || desc->noloop_assumptions != NULL_RTX)
438 n_copies = exit_mod + i + 2;
439 else
440 n_copies = i + 1;
442 if (n_copies < best_copies)
444 best_copies = n_copies;
445 best_unroll = i;
449 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
450 loop->lpt_decision.times = best_unroll;
453 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
454 The transformation does this:
456 for (i = 0; i < 102; i++)
457 body;
459 ==> (LOOP->LPT_DECISION.TIMES == 3)
461 i = 0;
462 body; i++;
463 body; i++;
464 while (i < 102)
466 body; i++;
467 body; i++;
468 body; i++;
469 body; i++;
472 static void
473 unroll_loop_constant_iterations (struct loop *loop)
475 unsigned HOST_WIDE_INT niter;
476 unsigned exit_mod;
477 sbitmap wont_exit;
478 unsigned i;
479 edge e;
480 unsigned max_unroll = loop->lpt_decision.times;
481 struct niter_desc *desc = get_simple_loop_desc (loop);
482 bool exit_at_end = loop_exit_at_end_p (loop);
483 struct opt_info *opt_info = NULL;
484 bool ok;
486 niter = desc->niter;
488 /* Should not get here (such loop should be peeled instead). */
489 gcc_assert (niter > max_unroll + 1);
491 exit_mod = niter % (max_unroll + 1);
493 wont_exit = sbitmap_alloc (max_unroll + 1);
494 bitmap_ones (wont_exit);
496 auto_vec<edge> remove_edges;
497 if (flag_split_ivs_in_unroller
498 || flag_variable_expansion_in_unroller)
499 opt_info = analyze_insns_in_loop (loop);
501 if (!exit_at_end)
503 /* The exit is not at the end of the loop; leave exit test
504 in the first copy, so that the loops that start with test
505 of exit condition have continuous body after unrolling. */
507 if (dump_file)
508 fprintf (dump_file, ";; Condition at beginning of loop.\n");
510 /* Peel exit_mod iterations. */
511 bitmap_clear_bit (wont_exit, 0);
512 if (desc->noloop_assumptions)
513 bitmap_clear_bit (wont_exit, 1);
515 if (exit_mod)
517 opt_info_start_duplication (opt_info);
518 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
519 exit_mod,
520 wont_exit, desc->out_edge,
521 &remove_edges,
522 DLTHE_FLAG_UPDATE_FREQ
523 | (opt_info && exit_mod > 1
524 ? DLTHE_RECORD_COPY_NUMBER
525 : 0));
526 gcc_assert (ok);
528 if (opt_info && exit_mod > 1)
529 apply_opt_in_copies (opt_info, exit_mod, false, false);
531 desc->noloop_assumptions = NULL_RTX;
532 desc->niter -= exit_mod;
533 loop->nb_iterations_upper_bound -= exit_mod;
534 if (loop->any_estimate
535 && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
536 loop->nb_iterations_estimate -= exit_mod;
537 else
538 loop->any_estimate = false;
541 bitmap_set_bit (wont_exit, 1);
543 else
545 /* Leave exit test in last copy, for the same reason as above if
546 the loop tests the condition at the end of loop body. */
548 if (dump_file)
549 fprintf (dump_file, ";; Condition at end of loop.\n");
551 /* We know that niter >= max_unroll + 2; so we do not need to care of
552 case when we would exit before reaching the loop. So just peel
553 exit_mod + 1 iterations. */
554 if (exit_mod != max_unroll
555 || desc->noloop_assumptions)
557 bitmap_clear_bit (wont_exit, 0);
558 if (desc->noloop_assumptions)
559 bitmap_clear_bit (wont_exit, 1);
561 opt_info_start_duplication (opt_info);
562 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
563 exit_mod + 1,
564 wont_exit, desc->out_edge,
565 &remove_edges,
566 DLTHE_FLAG_UPDATE_FREQ
567 | (opt_info && exit_mod > 0
568 ? DLTHE_RECORD_COPY_NUMBER
569 : 0));
570 gcc_assert (ok);
572 if (opt_info && exit_mod > 0)
573 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
575 desc->niter -= exit_mod + 1;
576 loop->nb_iterations_upper_bound -= exit_mod + 1;
577 if (loop->any_estimate
578 && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
579 loop->nb_iterations_estimate -= exit_mod + 1;
580 else
581 loop->any_estimate = false;
582 desc->noloop_assumptions = NULL_RTX;
584 bitmap_set_bit (wont_exit, 0);
585 bitmap_set_bit (wont_exit, 1);
588 bitmap_clear_bit (wont_exit, max_unroll);
591 /* Now unroll the loop. */
593 opt_info_start_duplication (opt_info);
594 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
595 max_unroll,
596 wont_exit, desc->out_edge,
597 &remove_edges,
598 DLTHE_FLAG_UPDATE_FREQ
599 | (opt_info
600 ? DLTHE_RECORD_COPY_NUMBER
601 : 0));
602 gcc_assert (ok);
604 if (opt_info)
606 apply_opt_in_copies (opt_info, max_unroll, true, true);
607 free_opt_info (opt_info);
610 free (wont_exit);
612 if (exit_at_end)
614 basic_block exit_block = get_bb_copy (desc->in_edge->src);
615 /* Find a new in and out edge; they are in the last copy we have made. */
617 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
619 desc->out_edge = EDGE_SUCC (exit_block, 0);
620 desc->in_edge = EDGE_SUCC (exit_block, 1);
622 else
624 desc->out_edge = EDGE_SUCC (exit_block, 1);
625 desc->in_edge = EDGE_SUCC (exit_block, 0);
629 desc->niter /= max_unroll + 1;
630 loop->nb_iterations_upper_bound
631 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
632 if (loop->any_estimate)
633 loop->nb_iterations_estimate
634 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
635 desc->niter_expr = GEN_INT (desc->niter);
637 /* Remove the edges. */
638 FOR_EACH_VEC_ELT (remove_edges, i, e)
639 remove_path (e);
641 if (dump_file)
642 fprintf (dump_file,
643 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
644 max_unroll, num_loop_insns (loop));
647 /* Decide whether to unroll LOOP iterating runtime computable number of times
648 and how much. */
649 static void
650 decide_unroll_runtime_iterations (struct loop *loop, int flags)
652 unsigned nunroll, nunroll_by_av, i;
653 struct niter_desc *desc;
654 widest_int iterations;
656 if (!(flags & UAP_UNROLL))
658 /* We were not asked to, just return back silently. */
659 return;
662 if (dump_file)
663 fprintf (dump_file,
664 "\n;; Considering unrolling loop with runtime "
665 "computable number of iterations\n");
667 /* nunroll = total number of copies of the original loop body in
668 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
669 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
670 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
671 if (nunroll > nunroll_by_av)
672 nunroll = nunroll_by_av;
673 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
674 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
676 if (targetm.loop_unroll_adjust)
677 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
679 /* Skip big loops. */
680 if (nunroll <= 1)
682 if (dump_file)
683 fprintf (dump_file, ";; Not considering loop, is too big\n");
684 return;
687 /* Check for simple loops. */
688 desc = get_simple_loop_desc (loop);
690 /* Check simpleness. */
691 if (!desc->simple_p || desc->assumptions)
693 if (dump_file)
694 fprintf (dump_file,
695 ";; Unable to prove that the number of iterations "
696 "can be counted in runtime\n");
697 return;
700 if (desc->const_iter)
702 if (dump_file)
703 fprintf (dump_file, ";; Loop iterates constant times\n");
704 return;
707 /* Check whether the loop rolls. */
708 if ((get_estimated_loop_iterations (loop, &iterations)
709 || get_max_loop_iterations (loop, &iterations))
710 && wi::ltu_p (iterations, 2 * nunroll))
712 if (dump_file)
713 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
714 return;
717 /* Success; now force nunroll to be power of 2, as we are unable to
718 cope with overflows in computation of number of iterations. */
719 for (i = 1; 2 * i <= nunroll; i *= 2)
720 continue;
722 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
723 loop->lpt_decision.times = i - 1;
726 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
727 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
728 and NULL is returned instead. */
730 basic_block
731 split_edge_and_insert (edge e, rtx_insn *insns)
733 basic_block bb;
735 if (!insns)
736 return NULL;
737 bb = split_edge (e);
738 emit_insn_after (insns, BB_END (bb));
740 /* ??? We used to assume that INSNS can contain control flow insns, and
741 that we had to try to find sub basic blocks in BB to maintain a valid
742 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
743 and call break_superblocks when going out of cfglayout mode. But it
744 turns out that this never happens; and that if it does ever happen,
745 the verify_flow_info at the end of the RTL loop passes would fail.
747 There are two reasons why we expected we could have control flow insns
748 in INSNS. The first is when a comparison has to be done in parts, and
749 the second is when the number of iterations is computed for loops with
750 the number of iterations known at runtime. In both cases, test cases
751 to get control flow in INSNS appear to be impossible to construct:
753 * If do_compare_rtx_and_jump needs several branches to do comparison
754 in a mode that needs comparison by parts, we cannot analyze the
755 number of iterations of the loop, and we never get to unrolling it.
757 * The code in expand_divmod that was suspected to cause creation of
758 branching code seems to be only accessed for signed division. The
759 divisions used by # of iterations analysis are always unsigned.
760 Problems might arise on architectures that emits branching code
761 for some operations that may appear in the unroller (especially
762 for division), but we have no such architectures.
764 Considering all this, it was decided that we should for now assume
765 that INSNS can in theory contain control flow insns, but in practice
766 it never does. So we don't handle the theoretical case, and should
767 a real failure ever show up, we have a pretty good clue for how to
768 fix it. */
770 return bb;
773 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
774 true, with probability PROB. If CINSN is not NULL, it is the insn to copy
775 in order to create a jump. */
777 static rtx_insn *
778 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
779 rtx_insn *cinsn)
781 rtx_insn *seq, *jump;
782 rtx cond;
783 machine_mode mode;
785 mode = GET_MODE (op0);
786 if (mode == VOIDmode)
787 mode = GET_MODE (op1);
789 start_sequence ();
790 if (GET_MODE_CLASS (mode) == MODE_CC)
792 /* A hack -- there seems to be no easy generic way how to make a
793 conditional jump from a ccmode comparison. */
794 gcc_assert (cinsn);
795 cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
796 gcc_assert (GET_CODE (cond) == comp);
797 gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
798 gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
799 emit_jump_insn (copy_insn (PATTERN (cinsn)));
800 jump = get_last_insn ();
801 gcc_assert (JUMP_P (jump));
802 JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
803 LABEL_NUSES (JUMP_LABEL (jump))++;
804 redirect_jump (jump, label, 0);
806 else
808 gcc_assert (!cinsn);
810 op0 = force_operand (op0, NULL_RTX);
811 op1 = force_operand (op1, NULL_RTX);
812 do_compare_rtx_and_jump (op0, op1, comp, 0,
813 mode, NULL_RTX, NULL_RTX, label, -1);
814 jump = get_last_insn ();
815 gcc_assert (JUMP_P (jump));
816 JUMP_LABEL (jump) = label;
817 LABEL_NUSES (label)++;
819 add_int_reg_note (jump, REG_BR_PROB, prob);
821 seq = get_insns ();
822 end_sequence ();
824 return seq;
827 /* Unroll LOOP for which we are able to count number of iterations in runtime
828 LOOP->LPT_DECISION.TIMES times. The transformation does this (with some
829 extra care for case n < 0):
831 for (i = 0; i < n; i++)
832 body;
834 ==> (LOOP->LPT_DECISION.TIMES == 3)
836 i = 0;
837 mod = n % 4;
839 switch (mod)
841 case 3:
842 body; i++;
843 case 2:
844 body; i++;
845 case 1:
846 body; i++;
847 case 0: ;
850 while (i < n)
852 body; i++;
853 body; i++;
854 body; i++;
855 body; i++;
858 static void
859 unroll_loop_runtime_iterations (struct loop *loop)
861 rtx old_niter, niter, tmp;
862 rtx_insn *init_code, *branch_code;
863 unsigned i, j, p;
864 basic_block preheader, *body, swtch, ezc_swtch;
865 sbitmap wont_exit;
866 int may_exit_copy;
867 unsigned n_peel;
868 edge e;
869 bool extra_zero_check, last_may_exit;
870 unsigned max_unroll = loop->lpt_decision.times;
871 struct niter_desc *desc = get_simple_loop_desc (loop);
872 bool exit_at_end = loop_exit_at_end_p (loop);
873 struct opt_info *opt_info = NULL;
874 bool ok;
876 if (flag_split_ivs_in_unroller
877 || flag_variable_expansion_in_unroller)
878 opt_info = analyze_insns_in_loop (loop);
880 /* Remember blocks whose dominators will have to be updated. */
881 auto_vec<basic_block> dom_bbs;
883 body = get_loop_body (loop);
884 for (i = 0; i < loop->num_nodes; i++)
886 vec<basic_block> ldom;
887 basic_block bb;
889 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
890 FOR_EACH_VEC_ELT (ldom, j, bb)
891 if (!flow_bb_inside_loop_p (loop, bb))
892 dom_bbs.safe_push (bb);
894 ldom.release ();
896 free (body);
898 if (!exit_at_end)
900 /* Leave exit in first copy (for explanation why see comment in
901 unroll_loop_constant_iterations). */
902 may_exit_copy = 0;
903 n_peel = max_unroll - 1;
904 extra_zero_check = true;
905 last_may_exit = false;
907 else
909 /* Leave exit in last copy (for explanation why see comment in
910 unroll_loop_constant_iterations). */
911 may_exit_copy = max_unroll;
912 n_peel = max_unroll;
913 extra_zero_check = false;
914 last_may_exit = true;
917 /* Get expression for number of iterations. */
918 start_sequence ();
919 old_niter = niter = gen_reg_rtx (desc->mode);
920 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
921 if (tmp != niter)
922 emit_move_insn (niter, tmp);
924 /* Count modulo by ANDing it with max_unroll; we use the fact that
925 the number of unrollings is a power of two, and thus this is correct
926 even if there is overflow in the computation. */
927 niter = expand_simple_binop (desc->mode, AND,
928 niter, gen_int_mode (max_unroll, desc->mode),
929 NULL_RTX, 0, OPTAB_LIB_WIDEN);
931 init_code = get_insns ();
932 end_sequence ();
933 unshare_all_rtl_in_chain (init_code);
935 /* Precondition the loop. */
936 split_edge_and_insert (loop_preheader_edge (loop), init_code);
938 auto_vec<edge> remove_edges;
940 wont_exit = sbitmap_alloc (max_unroll + 2);
942 /* Peel the first copy of loop body (almost always we must leave exit test
943 here; the only exception is when we have extra zero check and the number
944 of iterations is reliable. Also record the place of (possible) extra
945 zero check. */
946 bitmap_clear (wont_exit);
947 if (extra_zero_check
948 && !desc->noloop_assumptions)
949 bitmap_set_bit (wont_exit, 1);
950 ezc_swtch = loop_preheader_edge (loop)->src;
951 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
952 1, wont_exit, desc->out_edge,
953 &remove_edges,
954 DLTHE_FLAG_UPDATE_FREQ);
955 gcc_assert (ok);
957 /* Record the place where switch will be built for preconditioning. */
958 swtch = split_edge (loop_preheader_edge (loop));
960 for (i = 0; i < n_peel; i++)
962 /* Peel the copy. */
963 bitmap_clear (wont_exit);
964 if (i != n_peel - 1 || !last_may_exit)
965 bitmap_set_bit (wont_exit, 1);
966 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
967 1, wont_exit, desc->out_edge,
968 &remove_edges,
969 DLTHE_FLAG_UPDATE_FREQ);
970 gcc_assert (ok);
972 /* Create item for switch. */
973 j = n_peel - i - (extra_zero_check ? 0 : 1);
974 p = REG_BR_PROB_BASE / (i + 2);
976 preheader = split_edge (loop_preheader_edge (loop));
977 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
978 block_label (preheader), p,
979 NULL);
981 /* We rely on the fact that the compare and jump cannot be optimized out,
982 and hence the cfg we create is correct. */
983 gcc_assert (branch_code != NULL_RTX);
985 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
986 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
987 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
988 e = make_edge (swtch, preheader,
989 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
990 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
991 e->probability = p;
994 if (extra_zero_check)
996 /* Add branch for zero iterations. */
997 p = REG_BR_PROB_BASE / (max_unroll + 1);
998 swtch = ezc_swtch;
999 preheader = split_edge (loop_preheader_edge (loop));
1000 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1001 block_label (preheader), p,
1002 NULL);
1003 gcc_assert (branch_code != NULL_RTX);
1005 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1006 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1007 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1008 e = make_edge (swtch, preheader,
1009 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1010 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1011 e->probability = p;
1014 /* Recount dominators for outer blocks. */
1015 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1017 /* And unroll loop. */
1019 bitmap_ones (wont_exit);
1020 bitmap_clear_bit (wont_exit, may_exit_copy);
1021 opt_info_start_duplication (opt_info);
1023 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1024 max_unroll,
1025 wont_exit, desc->out_edge,
1026 &remove_edges,
1027 DLTHE_FLAG_UPDATE_FREQ
1028 | (opt_info
1029 ? DLTHE_RECORD_COPY_NUMBER
1030 : 0));
1031 gcc_assert (ok);
1033 if (opt_info)
1035 apply_opt_in_copies (opt_info, max_unroll, true, true);
1036 free_opt_info (opt_info);
1039 free (wont_exit);
1041 if (exit_at_end)
1043 basic_block exit_block = get_bb_copy (desc->in_edge->src);
1044 /* Find a new in and out edge; they are in the last copy we have
1045 made. */
1047 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1049 desc->out_edge = EDGE_SUCC (exit_block, 0);
1050 desc->in_edge = EDGE_SUCC (exit_block, 1);
1052 else
1054 desc->out_edge = EDGE_SUCC (exit_block, 1);
1055 desc->in_edge = EDGE_SUCC (exit_block, 0);
1059 /* Remove the edges. */
1060 FOR_EACH_VEC_ELT (remove_edges, i, e)
1061 remove_path (e);
1063 /* We must be careful when updating the number of iterations due to
1064 preconditioning and the fact that the value must be valid at entry
1065 of the loop. After passing through the above code, we see that
1066 the correct new number of iterations is this: */
1067 gcc_assert (!desc->const_iter);
1068 desc->niter_expr =
1069 simplify_gen_binary (UDIV, desc->mode, old_niter,
1070 gen_int_mode (max_unroll + 1, desc->mode));
1071 loop->nb_iterations_upper_bound
1072 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
1073 if (loop->any_estimate)
1074 loop->nb_iterations_estimate
1075 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
1076 if (exit_at_end)
1078 desc->niter_expr =
1079 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1080 desc->noloop_assumptions = NULL_RTX;
1081 --loop->nb_iterations_upper_bound;
1082 if (loop->any_estimate
1083 && loop->nb_iterations_estimate != 0)
1084 --loop->nb_iterations_estimate;
1085 else
1086 loop->any_estimate = false;
1089 if (dump_file)
1090 fprintf (dump_file,
1091 ";; Unrolled loop %d times, counting # of iterations "
1092 "in runtime, %i insns\n",
1093 max_unroll, num_loop_insns (loop));
/* Decide whether to unroll LOOP stupidly and how much.  On success the
   decision is recorded in LOOP->lpt_decision (LPT_UNROLL_STUPID plus the
   number of extra copies); on failure the function simply returns without
   touching the decision.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Give the target a chance to override the computed factor.  */
  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  A simple loop without assumptions is handled by the
     exact/runtime unrolling strategies instead of the "stupid" one.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  The loop computes the largest power of
     two that is <= nunroll.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}
/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==> (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  /* Gather IV-splitting / accumulator-expansion opportunities only when the
     corresponding flags request them.  */
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* All bits clear: every copy may exit the loop (this is "stupid"
     unrolling -- the exit test is kept in each copy).  */
  wont_exit = sbitmap_alloc (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
	 for a loop to be really simple.  We could update the counts, but the
	 problem is that we are unable to decide which exit will be taken
	 (not really true in case the number of iterations is constant,
	 but no one will do anything with this information, so we do not
	 worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}
/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  NOTE(review): *DEBUG_USES is incremented, not reset here --
   callers are expected to pass it zero-initialized.  */

static bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      /* The break below only leaves the inner insn walk; scanning the
	 remaining blocks is harmless since count_ref > 1 already decides
	 the result.  */
      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  ++*debug_uses;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}
/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  Each such
   debug insn has its location replaced by "unknown" so the debugger does
   not see stale values once REG is expanded; stops early after DEBUG_USES
   resets.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}
/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of accumulator.
   Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
   information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  /* Only single-set insns of the form dest = dest OP x are candidates.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      /* Reassociating FP accumulation is only valid under
	 -fassociative-math.  */
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we can not
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using; which includes the
	 initialization of the expansions with zero and the summation of
	 the expansions at the end of the computation will yield wrong
	 results for (x = something - x) thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  /* A zero step or a step in a narrower mode than the extend mode is not
     a candidate for splitting.  */
  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
/* Determines which of insns in LOOP can be optimized.
   Return a OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  /* Variable expansion needs a single non-complex exit where the partial
     accumulators can be combined.  */
  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      /* Only analyze blocks that are executed on every iteration
	 (i.e. dominate the latch).  */
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      /* Insert into both the hash table (for lookup by insn) and
		 the ordered list (for deterministic iteration).  */
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}
1610 /* Called just before loop duplication. Records start of duplicated area
1611 to OPT_INFO. */
1613 static void
1614 opt_info_start_duplication (struct opt_info *opt_info)
1616 if (opt_info)
1617 opt_info->first_new_block = last_basic_block_for_fn (cfun);
/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, initialization is done in the original loop body,
     which is copy number 0, so the delta is simply the copy number.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the copy in which the initialization occurs has
     number 1, and the original loop body (number 0) comes last.  */
  return n_copy ? n_copy - 1 : n_copies;
}
/* Allocate basic variable for the induction variable chain.  The new
   pseudo takes the mode of the IV insn's source expression; it is filled
   in later by insert_base_initialization.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}
/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  Emits "base_var = SET_SRC (INSN)" as a
   (possibly multi-insn) sequence immediately before INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}
/* Replace the use of induction variable described in IVTS in INSN
   by base variable + DELTA * step.  Tries three strategies in order:
   direct replacement, replacement via a fresh register, and finally
   recreating the whole assignment from scratch.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  /* Replace INSN entirely with the freshly built sequence.  */
  emit_insn_before (seq, insn);
  delete_insn (insn);
}
1736 /* Return one expansion of the accumulator recorded in struct VE. */
1738 static rtx
1739 get_expansion (struct var_to_expand *ve)
1741 rtx reg;
1743 if (ve->reuse_expansion == 0)
1744 reg = ve->reg;
1745 else
1746 reg = ve->var_expansions[ve->reuse_expansion - 1];
1748 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
1749 ve->reuse_expansion = 0;
1750 else
1751 ve->reuse_expansion++;
1753 return reg;
/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  The new register is recorded in VE only if the
   replacement actually validated (apply_change_group succeeded).  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}
/* Initialize the variable expansions in loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor sign zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero; the
   final result of sum should be -zero thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      /* Multiplicative accumulators start at 1, the identity element.  */
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  The combined value is folded back into VE->reg so that
   code after the loop sees the full accumulated result.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  /* Emit after the block's NOTE_INSN_BASIC_BLOCK so the sequence lands
     inside PLACE, past any leading notes/labels.  */
  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  /* An insn carries at most one REG_EQUAL/REG_EQUIV note, so we can stop
     at the first split IV mentioned in it.  */
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}
/* Apply loop optimizations in loop copies using the
   data which gathered during the unrolling.  Structure
   OPT_INFO record that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  /* First pass: rewrite the freshly duplicated blocks, walking each copied
     insn in lock-step with the corresponding insn of its original block.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_INSN_P (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  /* Advance the original-block cursor past non-insns and debug
	     label bindings so it stays aligned with INSN.  */
	  while (!INSN_P (orig_insn)
		 || (DEBUG_INSN_P (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find them as originals of the blocks
     in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}
/* Release OPT_INFO.  Frees both hash tables, every recorded expansion
   vector, and finally the struct itself.  The iv_to_split / var_to_expand
   entries themselves are owned by the hash tables.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}