* include/std/future (__location_invariant): Move specializations
[official-gcc.git] / gcc / loop-unroll.c
blob707ffff70b066c1bf28427ddcb37527ad8970bbb
1 /* Loop unrolling and peeling.
2 Copyright (C) 2002-2014 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "hard-reg-set.h"
27 #include "obstack.h"
28 #include "basic-block.h"
29 #include "cfgloop.h"
30 #include "params.h"
31 #include "expr.h"
32 #include "hash-table.h"
33 #include "recog.h"
34 #include "target.h"
35 #include "dumpfile.h"
37 /* This pass performs loop unrolling and peeling. We only perform these
38 optimizations on innermost loops (with single exception) because
39 the impact on performance is greatest here, and we want to avoid
40 unnecessary code size growth. The gain is caused by greater sequentiality
41 of code, better code to optimize for further passes and in some cases
42 by fewer testings of exit conditions. The main problem is code growth,
43 that impacts performance negatively due to effect of caches.
45 What we do:
47 -- complete peeling of once-rolling loops; this is the above mentioned
48 exception, as this causes loop to be cancelled completely and
49 does not cause code growth
50 -- complete peeling of loops that roll (small) constant times.
51 -- simple peeling of first iterations of loops that do not roll much
52 (according to profile feedback)
53 -- unrolling of loops that roll constant times; this is almost always
54 win, as we get rid of exit condition tests.
55 -- unrolling of loops that roll number of times that we can compute
56 in runtime; we also get rid of exit condition tests here, but there
57 is the extra expense for calculating the number of iterations
58 -- simple unrolling of remaining loops; this is performed only if we
59 are asked to, as the gain is questionable in this case and often
60 it may even slow down the code
61 For more detailed descriptions of each of those, see comments at
62 appropriate function below.
64 There is a lot of parameters (defined and described in params.def) that
65 control how much we unroll/peel.
67 ??? A great problem is that we don't have a good way how to determine
68 how many times we should unroll the loop; the experiments I have made
69 showed that this choice may affect performance in order of several %.
72 /* Information about induction variables to split. */
74 struct iv_to_split
76 rtx insn; /* The insn in that the induction variable occurs. */
77 rtx orig_var; /* The variable (register) for the IV before split. */
78 rtx base_var; /* The variable on that the values in the further
79 iterations are based. */
80 rtx step; /* Step of the induction variable. */
81 struct iv_to_split *next; /* Next entry in walking order. */
84 /* Information about accumulators to expand. */
86 struct var_to_expand
88 rtx insn; /* The insn in that the variable expansion occurs. */
89 rtx reg; /* The accumulator which is expanded. */
90 vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
91 struct var_to_expand *next; /* Next entry in walking order. */
92 enum rtx_code op; /* The type of the accumulation - addition, subtraction
93 or multiplication. */
94 int expansion_count; /* Count the number of expansions generated so far. */
95 int reuse_expansion; /* The expansion we intend to reuse to expand
96 the accumulator. If REUSE_EXPANSION is 0 reuse
97 the original accumulator. Else use
98 var_expansions[REUSE_EXPANSION - 1]. */
101 /* Hashtable helper for iv_to_split. */
103 struct iv_split_hasher : typed_free_remove <iv_to_split>
105 typedef iv_to_split value_type;
106 typedef iv_to_split compare_type;
107 static inline hashval_t hash (const value_type *);
108 static inline bool equal (const value_type *, const compare_type *);
112 /* A hash function for information about insns to split. */
114 inline hashval_t
115 iv_split_hasher::hash (const value_type *ivts)
117 return (hashval_t) INSN_UID (ivts->insn);
120 /* An equality functions for information about insns to split. */
122 inline bool
123 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
125 return i1->insn == i2->insn;
128 /* Hashtable helper for iv_to_split. */
130 struct var_expand_hasher : typed_free_remove <var_to_expand>
132 typedef var_to_expand value_type;
133 typedef var_to_expand compare_type;
134 static inline hashval_t hash (const value_type *);
135 static inline bool equal (const value_type *, const compare_type *);
138 /* Return a hash for VES. */
140 inline hashval_t
141 var_expand_hasher::hash (const value_type *ves)
143 return (hashval_t) INSN_UID (ves->insn);
146 /* Return true if I1 and I2 refer to the same instruction. */
148 inline bool
149 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
151 return i1->insn == i2->insn;
154 /* Information about optimization applied in
155 the unrolled loop. */
157 struct opt_info
159 hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
160 split. */
161 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
162 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
163 hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
164 insns with accumulators to expand. */
165 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
166 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
167 unsigned first_new_block; /* The first basic block that was
168 duplicated. */
169 basic_block loop_exit; /* The loop exit basic block. */
170 basic_block loop_preheader; /* The loop preheader basic block. */
173 static void decide_unrolling_and_peeling (int);
174 static void peel_loops_completely (int);
175 static void decide_peel_simple (struct loop *, int);
176 static void decide_peel_once_rolling (struct loop *, int);
177 static void decide_peel_completely (struct loop *, int);
178 static void decide_unroll_stupid (struct loop *, int);
179 static void decide_unroll_constant_iterations (struct loop *, int);
180 static void decide_unroll_runtime_iterations (struct loop *, int);
181 static void peel_loop_simple (struct loop *);
182 static void peel_loop_completely (struct loop *);
183 static void unroll_loop_stupid (struct loop *);
184 static void unroll_loop_constant_iterations (struct loop *);
185 static void unroll_loop_runtime_iterations (struct loop *);
186 static struct opt_info *analyze_insns_in_loop (struct loop *);
187 static void opt_info_start_duplication (struct opt_info *);
188 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
189 static void free_opt_info (struct opt_info *);
190 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
191 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
192 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
193 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
194 static void insert_var_expansion_initialization (struct var_to_expand *,
195 basic_block);
196 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
197 basic_block);
198 static rtx get_expansion (struct var_to_expand *);
200 /* Emit a message summarizing the unroll or peel that will be
201 performed for LOOP, along with the loop's location LOCUS, if
202 appropriate given the dump or -fopt-info settings. */
204 static void
205 report_unroll_peel (struct loop *loop, location_t locus)
207 struct niter_desc *desc;
208 int niters = 0;
209 int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
211 if (loop->lpt_decision.decision == LPT_NONE)
212 return;
214 if (!dump_enabled_p ())
215 return;
217 /* In the special case where the loop never iterated, emit
218 a different message so that we don't report an unroll by 0.
219 This matches the equivalent message emitted during tree unrolling. */
220 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
221 && !loop->lpt_decision.times)
223 dump_printf_loc (report_flags, locus,
224 "loop turned into non-loop; it never loops.\n");
225 return;
228 desc = get_simple_loop_desc (loop);
230 if (desc->const_iter)
231 niters = desc->niter;
232 else if (loop->header->count)
233 niters = expected_loop_iterations (loop);
235 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
236 dump_printf_loc (report_flags, locus,
237 "loop with %d iterations completely unrolled",
238 loop->lpt_decision.times + 1);
239 else
240 dump_printf_loc (report_flags, locus,
241 "loop %s %d times",
242 (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
243 ? "peeled" : "unrolled"),
244 loop->lpt_decision.times);
245 if (profile_info)
246 dump_printf (report_flags,
247 " (header execution count %d",
248 (int)loop->header->count);
249 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
250 dump_printf (report_flags,
251 "%s%s iterations %d)",
252 profile_info ? ", " : " (",
253 desc->const_iter ? "const" : "average",
254 niters);
255 else if (profile_info)
256 dump_printf (report_flags, ")");
258 dump_printf (report_flags, "\n");
261 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
262 void
263 unroll_and_peel_loops (int flags)
265 struct loop *loop;
266 bool changed = false;
268 /* First perform complete loop peeling (it is almost surely a win,
269 and affects parameters for further decision a lot). */
270 peel_loops_completely (flags);
272 /* Now decide rest of unrolling and peeling. */
273 decide_unrolling_and_peeling (flags);
275 /* Scan the loops, inner ones first. */
276 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
278 /* And perform the appropriate transformations. */
279 switch (loop->lpt_decision.decision)
281 case LPT_PEEL_COMPLETELY:
282 /* Already done. */
283 gcc_unreachable ();
284 case LPT_PEEL_SIMPLE:
285 peel_loop_simple (loop);
286 changed = true;
287 break;
288 case LPT_UNROLL_CONSTANT:
289 unroll_loop_constant_iterations (loop);
290 changed = true;
291 break;
292 case LPT_UNROLL_RUNTIME:
293 unroll_loop_runtime_iterations (loop);
294 changed = true;
295 break;
296 case LPT_UNROLL_STUPID:
297 unroll_loop_stupid (loop);
298 changed = true;
299 break;
300 case LPT_NONE:
301 break;
302 default:
303 gcc_unreachable ();
307 if (changed)
309 calculate_dominance_info (CDI_DOMINATORS);
310 fix_loop_structure (NULL);
313 iv_analysis_done ();
316 /* Check whether exit of the LOOP is at the end of loop body. */
318 static bool
319 loop_exit_at_end_p (struct loop *loop)
321 struct niter_desc *desc = get_simple_loop_desc (loop);
322 rtx insn;
324 if (desc->in_edge->dest != loop->latch)
325 return false;
327 /* Check that the latch is empty. */
328 FOR_BB_INSNS (loop->latch, insn)
330 if (NONDEBUG_INSN_P (insn))
331 return false;
334 return true;
337 /* Depending on FLAGS, check whether to peel loops completely and do so. */
338 static void
339 peel_loops_completely (int flags)
341 struct loop *loop;
342 bool changed = false;
344 /* Scan the loops, the inner ones first. */
345 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
347 loop->lpt_decision.decision = LPT_NONE;
348 location_t locus = get_loop_location (loop);
350 if (dump_enabled_p ())
351 dump_printf_loc (TDF_RTL, locus,
352 ";; *** Considering loop %d at BB %d for "
353 "complete peeling ***\n",
354 loop->num, loop->header->index);
356 loop->ninsns = num_loop_insns (loop);
358 decide_peel_once_rolling (loop, flags);
359 if (loop->lpt_decision.decision == LPT_NONE)
360 decide_peel_completely (loop, flags);
362 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
364 report_unroll_peel (loop, locus);
365 peel_loop_completely (loop);
366 changed = true;
370 if (changed)
372 calculate_dominance_info (CDI_DOMINATORS);
373 fix_loop_structure (NULL);
377 /* Decide whether unroll or peel loops (depending on FLAGS) and how much. */
378 static void
379 decide_unrolling_and_peeling (int flags)
381 struct loop *loop;
383 /* Scan the loops, inner ones first. */
384 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
386 loop->lpt_decision.decision = LPT_NONE;
387 location_t locus = get_loop_location (loop);
389 if (dump_enabled_p ())
390 dump_printf_loc (TDF_RTL, locus,
391 ";; *** Considering loop %d at BB %d for "
392 "unrolling and peeling ***\n",
393 loop->num, loop->header->index);
395 /* Do not peel cold areas. */
396 if (optimize_loop_for_size_p (loop))
398 if (dump_file)
399 fprintf (dump_file, ";; Not considering loop, cold area\n");
400 continue;
403 /* Can the loop be manipulated? */
404 if (!can_duplicate_loop_p (loop))
406 if (dump_file)
407 fprintf (dump_file,
408 ";; Not considering loop, cannot duplicate\n");
409 continue;
412 /* Skip non-innermost loops. */
413 if (loop->inner)
415 if (dump_file)
416 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
417 continue;
420 loop->ninsns = num_loop_insns (loop);
421 loop->av_ninsns = average_num_loop_insns (loop);
423 /* Try transformations one by one in decreasing order of
424 priority. */
426 decide_unroll_constant_iterations (loop, flags);
427 if (loop->lpt_decision.decision == LPT_NONE)
428 decide_unroll_runtime_iterations (loop, flags);
429 if (loop->lpt_decision.decision == LPT_NONE)
430 decide_unroll_stupid (loop, flags);
431 if (loop->lpt_decision.decision == LPT_NONE)
432 decide_peel_simple (loop, flags);
434 report_unroll_peel (loop, locus);
438 /* Decide whether the LOOP is once rolling and suitable for complete
439 peeling. */
440 static void
441 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
443 struct niter_desc *desc;
445 if (dump_file)
446 fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
448 /* Is the loop small enough? */
449 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
451 if (dump_file)
452 fprintf (dump_file, ";; Not considering loop, is too big\n");
453 return;
456 /* Check for simple loops. */
457 desc = get_simple_loop_desc (loop);
459 /* Check number of iterations. */
460 if (!desc->simple_p
461 || desc->assumptions
462 || desc->infinite
463 || !desc->const_iter
464 || (desc->niter != 0
465 && get_max_loop_iterations_int (loop) != 0))
467 if (dump_file)
468 fprintf (dump_file,
469 ";; Unable to prove that the loop rolls exactly once\n");
470 return;
473 /* Success. */
474 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
477 /* Decide whether the LOOP is suitable for complete peeling. */
478 static void
479 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
481 unsigned npeel;
482 struct niter_desc *desc;
484 if (dump_file)
485 fprintf (dump_file, "\n;; Considering peeling completely\n");
487 /* Skip non-innermost loops. */
488 if (loop->inner)
490 if (dump_file)
491 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
492 return;
495 /* Do not peel cold areas. */
496 if (optimize_loop_for_size_p (loop))
498 if (dump_file)
499 fprintf (dump_file, ";; Not considering loop, cold area\n");
500 return;
503 /* Can the loop be manipulated? */
504 if (!can_duplicate_loop_p (loop))
506 if (dump_file)
507 fprintf (dump_file,
508 ";; Not considering loop, cannot duplicate\n");
509 return;
512 /* npeel = number of iterations to peel. */
513 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
514 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
515 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
517 /* Is the loop small enough? */
518 if (!npeel)
520 if (dump_file)
521 fprintf (dump_file, ";; Not considering loop, is too big\n");
522 return;
525 /* Check for simple loops. */
526 desc = get_simple_loop_desc (loop);
528 /* Check number of iterations. */
529 if (!desc->simple_p
530 || desc->assumptions
531 || !desc->const_iter
532 || desc->infinite)
534 if (dump_file)
535 fprintf (dump_file,
536 ";; Unable to prove that the loop iterates constant times\n");
537 return;
540 if (desc->niter > npeel - 1)
542 if (dump_file)
544 fprintf (dump_file,
545 ";; Not peeling loop completely, rolls too much (");
546 fprintf (dump_file, "%"PRId64, desc->niter);
547 fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
549 return;
552 /* Success. */
553 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
556 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
557 completely. The transformation done:
559 for (i = 0; i < 4; i++)
560 body;
564 i = 0;
565 body; i++;
566 body; i++;
567 body; i++;
568 body; i++;
570 static void
571 peel_loop_completely (struct loop *loop)
573 sbitmap wont_exit;
574 unsigned HOST_WIDE_INT npeel;
575 unsigned i;
576 edge ein;
577 struct niter_desc *desc = get_simple_loop_desc (loop);
578 struct opt_info *opt_info = NULL;
580 npeel = desc->niter;
582 if (npeel)
584 bool ok;
586 wont_exit = sbitmap_alloc (npeel + 1);
587 bitmap_ones (wont_exit);
588 bitmap_clear_bit (wont_exit, 0);
589 if (desc->noloop_assumptions)
590 bitmap_clear_bit (wont_exit, 1);
592 auto_vec<edge> remove_edges;
593 if (flag_split_ivs_in_unroller)
594 opt_info = analyze_insns_in_loop (loop);
596 opt_info_start_duplication (opt_info);
597 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
598 npeel,
599 wont_exit, desc->out_edge,
600 &remove_edges,
601 DLTHE_FLAG_UPDATE_FREQ
602 | DLTHE_FLAG_COMPLETTE_PEEL
603 | (opt_info
604 ? DLTHE_RECORD_COPY_NUMBER : 0));
605 gcc_assert (ok);
607 free (wont_exit);
609 if (opt_info)
611 apply_opt_in_copies (opt_info, npeel, false, true);
612 free_opt_info (opt_info);
615 /* Remove the exit edges. */
616 FOR_EACH_VEC_ELT (remove_edges, i, ein)
617 remove_path (ein);
620 ein = desc->in_edge;
621 free_simple_loop_desc (loop);
623 /* Now remove the unreachable part of the last iteration and cancel
624 the loop. */
625 remove_path (ein);
627 if (dump_file)
628 fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
631 /* Decide whether to unroll LOOP iterating constant number of times
632 and how much. */
634 static void
635 decide_unroll_constant_iterations (struct loop *loop, int flags)
637 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
638 struct niter_desc *desc;
639 widest_int iterations;
641 if (!(flags & UAP_UNROLL))
643 /* We were not asked to, just return back silently. */
644 return;
647 if (dump_file)
648 fprintf (dump_file,
649 "\n;; Considering unrolling loop with constant "
650 "number of iterations\n");
652 /* nunroll = total number of copies of the original loop body in
653 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
654 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
655 nunroll_by_av
656 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
657 if (nunroll > nunroll_by_av)
658 nunroll = nunroll_by_av;
659 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
660 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
662 if (targetm.loop_unroll_adjust)
663 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
665 /* Skip big loops. */
666 if (nunroll <= 1)
668 if (dump_file)
669 fprintf (dump_file, ";; Not considering loop, is too big\n");
670 return;
673 /* Check for simple loops. */
674 desc = get_simple_loop_desc (loop);
676 /* Check number of iterations. */
677 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
679 if (dump_file)
680 fprintf (dump_file,
681 ";; Unable to prove that the loop iterates constant times\n");
682 return;
685 /* Check whether the loop rolls enough to consider.
686 Consult also loop bounds and profile; in the case the loop has more
687 than one exit it may well loop less than determined maximal number
688 of iterations. */
689 if (desc->niter < 2 * nunroll
690 || ((get_estimated_loop_iterations (loop, &iterations)
691 || get_max_loop_iterations (loop, &iterations))
692 && wi::ltu_p (iterations, 2 * nunroll)))
694 if (dump_file)
695 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
696 return;
699 /* Success; now compute number of iterations to unroll. We alter
700 nunroll so that as few as possible copies of loop body are
701 necessary, while still not decreasing the number of unrollings
702 too much (at most by 1). */
703 best_copies = 2 * nunroll + 10;
705 i = 2 * nunroll + 2;
706 if (i - 1 >= desc->niter)
707 i = desc->niter - 2;
709 for (; i >= nunroll - 1; i--)
711 unsigned exit_mod = desc->niter % (i + 1);
713 if (!loop_exit_at_end_p (loop))
714 n_copies = exit_mod + i + 1;
715 else if (exit_mod != (unsigned) i
716 || desc->noloop_assumptions != NULL_RTX)
717 n_copies = exit_mod + i + 2;
718 else
719 n_copies = i + 1;
721 if (n_copies < best_copies)
723 best_copies = n_copies;
724 best_unroll = i;
728 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
729 loop->lpt_decision.times = best_unroll;
732 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
733 The transformation does this:
735 for (i = 0; i < 102; i++)
736 body;
738 ==> (LOOP->LPT_DECISION.TIMES == 3)
740 i = 0;
741 body; i++;
742 body; i++;
743 while (i < 102)
745 body; i++;
746 body; i++;
747 body; i++;
748 body; i++;
751 static void
752 unroll_loop_constant_iterations (struct loop *loop)
754 unsigned HOST_WIDE_INT niter;
755 unsigned exit_mod;
756 sbitmap wont_exit;
757 unsigned i;
758 edge e;
759 unsigned max_unroll = loop->lpt_decision.times;
760 struct niter_desc *desc = get_simple_loop_desc (loop);
761 bool exit_at_end = loop_exit_at_end_p (loop);
762 struct opt_info *opt_info = NULL;
763 bool ok;
765 niter = desc->niter;
767 /* Should not get here (such loop should be peeled instead). */
768 gcc_assert (niter > max_unroll + 1);
770 exit_mod = niter % (max_unroll + 1);
772 wont_exit = sbitmap_alloc (max_unroll + 1);
773 bitmap_ones (wont_exit);
775 auto_vec<edge> remove_edges;
776 if (flag_split_ivs_in_unroller
777 || flag_variable_expansion_in_unroller)
778 opt_info = analyze_insns_in_loop (loop);
780 if (!exit_at_end)
782 /* The exit is not at the end of the loop; leave exit test
783 in the first copy, so that the loops that start with test
784 of exit condition have continuous body after unrolling. */
786 if (dump_file)
787 fprintf (dump_file, ";; Condition at beginning of loop.\n");
789 /* Peel exit_mod iterations. */
790 bitmap_clear_bit (wont_exit, 0);
791 if (desc->noloop_assumptions)
792 bitmap_clear_bit (wont_exit, 1);
794 if (exit_mod)
796 opt_info_start_duplication (opt_info);
797 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
798 exit_mod,
799 wont_exit, desc->out_edge,
800 &remove_edges,
801 DLTHE_FLAG_UPDATE_FREQ
802 | (opt_info && exit_mod > 1
803 ? DLTHE_RECORD_COPY_NUMBER
804 : 0));
805 gcc_assert (ok);
807 if (opt_info && exit_mod > 1)
808 apply_opt_in_copies (opt_info, exit_mod, false, false);
810 desc->noloop_assumptions = NULL_RTX;
811 desc->niter -= exit_mod;
812 loop->nb_iterations_upper_bound -= exit_mod;
813 if (loop->any_estimate
814 && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
815 loop->nb_iterations_estimate -= exit_mod;
816 else
817 loop->any_estimate = false;
820 bitmap_set_bit (wont_exit, 1);
822 else
824 /* Leave exit test in last copy, for the same reason as above if
825 the loop tests the condition at the end of loop body. */
827 if (dump_file)
828 fprintf (dump_file, ";; Condition at end of loop.\n");
830 /* We know that niter >= max_unroll + 2; so we do not need to care of
831 case when we would exit before reaching the loop. So just peel
832 exit_mod + 1 iterations. */
833 if (exit_mod != max_unroll
834 || desc->noloop_assumptions)
836 bitmap_clear_bit (wont_exit, 0);
837 if (desc->noloop_assumptions)
838 bitmap_clear_bit (wont_exit, 1);
840 opt_info_start_duplication (opt_info);
841 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
842 exit_mod + 1,
843 wont_exit, desc->out_edge,
844 &remove_edges,
845 DLTHE_FLAG_UPDATE_FREQ
846 | (opt_info && exit_mod > 0
847 ? DLTHE_RECORD_COPY_NUMBER
848 : 0));
849 gcc_assert (ok);
851 if (opt_info && exit_mod > 0)
852 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
854 desc->niter -= exit_mod + 1;
855 loop->nb_iterations_upper_bound -= exit_mod + 1;
856 if (loop->any_estimate
857 && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
858 loop->nb_iterations_estimate -= exit_mod + 1;
859 else
860 loop->any_estimate = false;
861 desc->noloop_assumptions = NULL_RTX;
863 bitmap_set_bit (wont_exit, 0);
864 bitmap_set_bit (wont_exit, 1);
867 bitmap_clear_bit (wont_exit, max_unroll);
870 /* Now unroll the loop. */
872 opt_info_start_duplication (opt_info);
873 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
874 max_unroll,
875 wont_exit, desc->out_edge,
876 &remove_edges,
877 DLTHE_FLAG_UPDATE_FREQ
878 | (opt_info
879 ? DLTHE_RECORD_COPY_NUMBER
880 : 0));
881 gcc_assert (ok);
883 if (opt_info)
885 apply_opt_in_copies (opt_info, max_unroll, true, true);
886 free_opt_info (opt_info);
889 free (wont_exit);
891 if (exit_at_end)
893 basic_block exit_block = get_bb_copy (desc->in_edge->src);
894 /* Find a new in and out edge; they are in the last copy we have made. */
896 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
898 desc->out_edge = EDGE_SUCC (exit_block, 0);
899 desc->in_edge = EDGE_SUCC (exit_block, 1);
901 else
903 desc->out_edge = EDGE_SUCC (exit_block, 1);
904 desc->in_edge = EDGE_SUCC (exit_block, 0);
908 desc->niter /= max_unroll + 1;
909 loop->nb_iterations_upper_bound
910 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
911 if (loop->any_estimate)
912 loop->nb_iterations_estimate
913 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
914 desc->niter_expr = GEN_INT (desc->niter);
916 /* Remove the edges. */
917 FOR_EACH_VEC_ELT (remove_edges, i, e)
918 remove_path (e);
920 if (dump_file)
921 fprintf (dump_file,
922 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
923 max_unroll, num_loop_insns (loop));
926 /* Decide whether to unroll LOOP iterating runtime computable number of times
927 and how much. */
928 static void
929 decide_unroll_runtime_iterations (struct loop *loop, int flags)
931 unsigned nunroll, nunroll_by_av, i;
932 struct niter_desc *desc;
933 widest_int iterations;
935 if (!(flags & UAP_UNROLL))
937 /* We were not asked to, just return back silently. */
938 return;
941 if (dump_file)
942 fprintf (dump_file,
943 "\n;; Considering unrolling loop with runtime "
944 "computable number of iterations\n");
946 /* nunroll = total number of copies of the original loop body in
947 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
948 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
949 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
950 if (nunroll > nunroll_by_av)
951 nunroll = nunroll_by_av;
952 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
953 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
955 if (targetm.loop_unroll_adjust)
956 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
958 /* Skip big loops. */
959 if (nunroll <= 1)
961 if (dump_file)
962 fprintf (dump_file, ";; Not considering loop, is too big\n");
963 return;
966 /* Check for simple loops. */
967 desc = get_simple_loop_desc (loop);
969 /* Check simpleness. */
970 if (!desc->simple_p || desc->assumptions)
972 if (dump_file)
973 fprintf (dump_file,
974 ";; Unable to prove that the number of iterations "
975 "can be counted in runtime\n");
976 return;
979 if (desc->const_iter)
981 if (dump_file)
982 fprintf (dump_file, ";; Loop iterates constant times\n");
983 return;
986 /* Check whether the loop rolls. */
987 if ((get_estimated_loop_iterations (loop, &iterations)
988 || get_max_loop_iterations (loop, &iterations))
989 && wi::ltu_p (iterations, 2 * nunroll))
991 if (dump_file)
992 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
993 return;
996 /* Success; now force nunroll to be power of 2, as we are unable to
997 cope with overflows in computation of number of iterations. */
998 for (i = 1; 2 * i <= nunroll; i *= 2)
999 continue;
1001 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1002 loop->lpt_decision.times = i - 1;
1005 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1006 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
1007 and NULL is returned instead. */
1009 basic_block
1010 split_edge_and_insert (edge e, rtx insns)
1012 basic_block bb;
1014 if (!insns)
1015 return NULL;
1016 bb = split_edge (e);
1017 emit_insn_after (insns, BB_END (bb));
1019 /* ??? We used to assume that INSNS can contain control flow insns, and
1020 that we had to try to find sub basic blocks in BB to maintain a valid
1021 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
1022 and call break_superblocks when going out of cfglayout mode. But it
1023 turns out that this never happens; and that if it does ever happen,
1024 the verify_flow_info at the end of the RTL loop passes would fail.
1026 There are two reasons why we expected we could have control flow insns
1027 in INSNS. The first is when a comparison has to be done in parts, and
1028 the second is when the number of iterations is computed for loops with
1029 the number of iterations known at runtime. In both cases, test cases
1030 to get control flow in INSNS appear to be impossible to construct:
1032 * If do_compare_rtx_and_jump needs several branches to do comparison
1033 in a mode that needs comparison by parts, we cannot analyze the
1034 number of iterations of the loop, and we never get to unrolling it.
1036 * The code in expand_divmod that was suspected to cause creation of
1037 branching code seems to be only accessed for signed division. The
1038 divisions used by # of iterations analysis are always unsigned.
1039 Problems might arise on architectures that emits branching code
1040 for some operations that may appear in the unroller (especially
1041 for division), but we have no such architectures.
1043 Considering all this, it was decided that we should for now assume
1044 that INSNS can in theory contain control flow insns, but in practice
1045 it never does. So we don't handle the theoretical case, and should
1046 a real failure ever show up, we have a pretty good clue for how to
1047 fix it. */
1049 return bb;
1052 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
1053 true, with probability PROB. If CINSN is not NULL, it is the insn to copy
1054 in order to create a jump. */
1056 static rtx
1057 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
1058 rtx cinsn)
1060 rtx seq, jump, cond;
1061 enum machine_mode mode;
1063 mode = GET_MODE (op0);
1064 if (mode == VOIDmode)
1065 mode = GET_MODE (op1);
1067 start_sequence ();
1068 if (GET_MODE_CLASS (mode) == MODE_CC)
1070 /* A hack -- there seems to be no easy generic way how to make a
1071 conditional jump from a ccmode comparison. */
1072 gcc_assert (cinsn);
1073 cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
1074 gcc_assert (GET_CODE (cond) == comp);
1075 gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
1076 gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
1077 emit_jump_insn (copy_insn (PATTERN (cinsn)));
1078 jump = get_last_insn ();
1079 gcc_assert (JUMP_P (jump));
1080 JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
1081 LABEL_NUSES (JUMP_LABEL (jump))++;
1082 redirect_jump (jump, label, 0);
1084 else
1086 gcc_assert (!cinsn);
1088 op0 = force_operand (op0, NULL_RTX);
1089 op1 = force_operand (op1, NULL_RTX);
1090 do_compare_rtx_and_jump (op0, op1, comp, 0,
1091 mode, NULL_RTX, NULL_RTX, label, -1);
1092 jump = get_last_insn ();
1093 gcc_assert (JUMP_P (jump));
1094 JUMP_LABEL (jump) = label;
1095 LABEL_NUSES (label)++;
1097 add_int_reg_note (jump, REG_BR_PROB, prob);
1099 seq = get_insns ();
1100 end_sequence ();
1102 return seq;
1105 /* Unroll LOOP for which we are able to count number of iterations in runtime
1106 LOOP->LPT_DECISION.TIMES times. The transformation does this (with some
1107 extra care for case n < 0):
1109 for (i = 0; i < n; i++)
1110 body;
1112 ==> (LOOP->LPT_DECISION.TIMES == 3)
1114 i = 0;
1115 mod = n % 4;
1117 switch (mod)
1119 case 3:
1120 body; i++;
1121 case 2:
1122 body; i++;
1123 case 1:
1124 body; i++;
1125 case 0: ;
1128 while (i < n)
1130 body; i++;
1131 body; i++;
1132 body; i++;
1133 body; i++;
1136 static void
1137 unroll_loop_runtime_iterations (struct loop *loop)
1139 rtx old_niter, niter, init_code, branch_code, tmp;
1140 unsigned i, j, p;
1141 basic_block preheader, *body, swtch, ezc_swtch;
1142 sbitmap wont_exit;
1143 int may_exit_copy;
1144 unsigned n_peel;
1145 edge e;
1146 bool extra_zero_check, last_may_exit;
1147 unsigned max_unroll = loop->lpt_decision.times;
1148 struct niter_desc *desc = get_simple_loop_desc (loop);
1149 bool exit_at_end = loop_exit_at_end_p (loop);
1150 struct opt_info *opt_info = NULL;
1151 bool ok;
1153 if (flag_split_ivs_in_unroller
1154 || flag_variable_expansion_in_unroller)
1155 opt_info = analyze_insns_in_loop (loop);
1157 /* Remember blocks whose dominators will have to be updated. */
1158 auto_vec<basic_block> dom_bbs;
1160 body = get_loop_body (loop);
1161 for (i = 0; i < loop->num_nodes; i++)
1163 vec<basic_block> ldom;
1164 basic_block bb;
1166 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1167 FOR_EACH_VEC_ELT (ldom, j, bb)
1168 if (!flow_bb_inside_loop_p (loop, bb))
1169 dom_bbs.safe_push (bb);
1171 ldom.release ();
1173 free (body);
1175 if (!exit_at_end)
1177 /* Leave exit in first copy (for explanation why see comment in
1178 unroll_loop_constant_iterations). */
1179 may_exit_copy = 0;
1180 n_peel = max_unroll - 1;
1181 extra_zero_check = true;
1182 last_may_exit = false;
1184 else
1186 /* Leave exit in last copy (for explanation why see comment in
1187 unroll_loop_constant_iterations). */
1188 may_exit_copy = max_unroll;
1189 n_peel = max_unroll;
1190 extra_zero_check = false;
1191 last_may_exit = true;
1194 /* Get expression for number of iterations. */
1195 start_sequence ();
1196 old_niter = niter = gen_reg_rtx (desc->mode);
1197 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1198 if (tmp != niter)
1199 emit_move_insn (niter, tmp);
1201 /* Count modulo by ANDing it with max_unroll; we use the fact that
1202 the number of unrollings is a power of two, and thus this is correct
1203 even if there is overflow in the computation. */
1204 niter = expand_simple_binop (desc->mode, AND,
1205 niter, gen_int_mode (max_unroll, desc->mode),
1206 NULL_RTX, 0, OPTAB_LIB_WIDEN);
1208 init_code = get_insns ();
1209 end_sequence ();
1210 unshare_all_rtl_in_chain (init_code);
1212 /* Precondition the loop. */
1213 split_edge_and_insert (loop_preheader_edge (loop), init_code);
1215 auto_vec<edge> remove_edges;
1217 wont_exit = sbitmap_alloc (max_unroll + 2);
1219 /* Peel the first copy of loop body (almost always we must leave exit test
1220 here; the only exception is when we have extra zero check and the number
1221 of iterations is reliable. Also record the place of (possible) extra
1222 zero check. */
1223 bitmap_clear (wont_exit);
1224 if (extra_zero_check
1225 && !desc->noloop_assumptions)
1226 bitmap_set_bit (wont_exit, 1);
1227 ezc_swtch = loop_preheader_edge (loop)->src;
1228 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1229 1, wont_exit, desc->out_edge,
1230 &remove_edges,
1231 DLTHE_FLAG_UPDATE_FREQ);
1232 gcc_assert (ok);
1234 /* Record the place where switch will be built for preconditioning. */
1235 swtch = split_edge (loop_preheader_edge (loop));
1237 for (i = 0; i < n_peel; i++)
1239 /* Peel the copy. */
1240 bitmap_clear (wont_exit);
1241 if (i != n_peel - 1 || !last_may_exit)
1242 bitmap_set_bit (wont_exit, 1);
1243 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1244 1, wont_exit, desc->out_edge,
1245 &remove_edges,
1246 DLTHE_FLAG_UPDATE_FREQ);
1247 gcc_assert (ok);
1249 /* Create item for switch. */
1250 j = n_peel - i - (extra_zero_check ? 0 : 1);
1251 p = REG_BR_PROB_BASE / (i + 2);
1253 preheader = split_edge (loop_preheader_edge (loop));
1254 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1255 block_label (preheader), p,
1256 NULL_RTX);
1258 /* We rely on the fact that the compare and jump cannot be optimized out,
1259 and hence the cfg we create is correct. */
1260 gcc_assert (branch_code != NULL_RTX);
1262 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1263 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1264 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1265 e = make_edge (swtch, preheader,
1266 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1267 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1268 e->probability = p;
1271 if (extra_zero_check)
1273 /* Add branch for zero iterations. */
1274 p = REG_BR_PROB_BASE / (max_unroll + 1);
1275 swtch = ezc_swtch;
1276 preheader = split_edge (loop_preheader_edge (loop));
1277 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1278 block_label (preheader), p,
1279 NULL_RTX);
1280 gcc_assert (branch_code != NULL_RTX);
1282 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1283 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1284 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1285 e = make_edge (swtch, preheader,
1286 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1287 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1288 e->probability = p;
1291 /* Recount dominators for outer blocks. */
1292 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1294 /* And unroll loop. */
1296 bitmap_ones (wont_exit);
1297 bitmap_clear_bit (wont_exit, may_exit_copy);
1298 opt_info_start_duplication (opt_info);
1300 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1301 max_unroll,
1302 wont_exit, desc->out_edge,
1303 &remove_edges,
1304 DLTHE_FLAG_UPDATE_FREQ
1305 | (opt_info
1306 ? DLTHE_RECORD_COPY_NUMBER
1307 : 0));
1308 gcc_assert (ok);
1310 if (opt_info)
1312 apply_opt_in_copies (opt_info, max_unroll, true, true);
1313 free_opt_info (opt_info);
1316 free (wont_exit);
1318 if (exit_at_end)
1320 basic_block exit_block = get_bb_copy (desc->in_edge->src);
1321 /* Find a new in and out edge; they are in the last copy we have
1322 made. */
1324 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1326 desc->out_edge = EDGE_SUCC (exit_block, 0);
1327 desc->in_edge = EDGE_SUCC (exit_block, 1);
1329 else
1331 desc->out_edge = EDGE_SUCC (exit_block, 1);
1332 desc->in_edge = EDGE_SUCC (exit_block, 0);
1336 /* Remove the edges. */
1337 FOR_EACH_VEC_ELT (remove_edges, i, e)
1338 remove_path (e);
1340 /* We must be careful when updating the number of iterations due to
1341 preconditioning and the fact that the value must be valid at entry
1342 of the loop. After passing through the above code, we see that
1343 the correct new number of iterations is this: */
1344 gcc_assert (!desc->const_iter);
1345 desc->niter_expr =
1346 simplify_gen_binary (UDIV, desc->mode, old_niter,
1347 gen_int_mode (max_unroll + 1, desc->mode));
1348 loop->nb_iterations_upper_bound
1349 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
1350 if (loop->any_estimate)
1351 loop->nb_iterations_estimate
1352 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
1353 if (exit_at_end)
1355 desc->niter_expr =
1356 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1357 desc->noloop_assumptions = NULL_RTX;
1358 --loop->nb_iterations_upper_bound;
1359 if (loop->any_estimate
1360 && loop->nb_iterations_estimate != 0)
1361 --loop->nb_iterations_estimate;
1362 else
1363 loop->any_estimate = false;
1366 if (dump_file)
1367 fprintf (dump_file,
1368 ";; Unrolled loop %d times, counting # of iterations "
1369 "in runtime, %i insns\n",
1370 max_unroll, num_loop_insns (loop));
1373 /* Decide whether to simply peel LOOP and how much. */
1374 static void
1375 decide_peel_simple (struct loop *loop, int flags)
1377 unsigned npeel;
1378 widest_int iterations;
1380 if (!(flags & UAP_PEEL))
1382 /* We were not asked to, just return back silently. */
1383 return;
1386 if (dump_file)
1387 fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1389 /* npeel = number of iterations to peel. */
1390 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1391 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1392 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1394 /* Skip big loops. */
1395 if (!npeel)
1397 if (dump_file)
1398 fprintf (dump_file, ";; Not considering loop, is too big\n");
1399 return;
1402 /* Do not simply peel loops with branches inside -- it increases number
1403 of mispredicts.
1404 Exception is when we do have profile and we however have good chance
1405 to peel proper number of iterations loop will iterate in practice.
1406 TODO: this heuristic needs tunning; while for complette unrolling
1407 the branch inside loop mostly eliminates any improvements, for
1408 peeling it is not the case. Also a function call inside loop is
1409 also branch from branch prediction POV (and probably better reason
1410 to not unroll/peel). */
1411 if (num_loop_branches (loop) > 1
1412 && profile_status_for_fn (cfun) != PROFILE_READ)
1414 if (dump_file)
1415 fprintf (dump_file, ";; Not peeling, contains branches\n");
1416 return;
1419 /* If we have realistic estimate on number of iterations, use it. */
1420 if (get_estimated_loop_iterations (loop, &iterations))
1422 if (wi::leu_p (npeel, iterations))
1424 if (dump_file)
1426 fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1427 fprintf (dump_file, "%"PRId64,
1428 (int64_t) (iterations.to_shwi () + 1));
1429 fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1430 npeel);
1432 return;
1434 npeel = iterations.to_shwi () + 1;
1436 /* If we have small enough bound on iterations, we can still peel (completely
1437 unroll). */
1438 else if (get_max_loop_iterations (loop, &iterations)
1439 && wi::ltu_p (iterations, npeel))
1440 npeel = iterations.to_shwi () + 1;
1441 else
1443 /* For now we have no good heuristics to decide whether loop peeling
1444 will be effective, so disable it. */
1445 if (dump_file)
1446 fprintf (dump_file,
1447 ";; Not peeling loop, no evidence it will be profitable\n");
1448 return;
1451 /* Success. */
1452 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1453 loop->lpt_decision.times = npeel;
1456 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1458 while (cond)
1459 body;
1461 ==> (LOOP->LPT_DECISION.TIMES == 3)
1463 if (!cond) goto end;
1464 body;
1465 if (!cond) goto end;
1466 body;
1467 if (!cond) goto end;
1468 body;
1469 while (cond)
1470 body;
1471 end: ;
1473 static void
1474 peel_loop_simple (struct loop *loop)
1476 sbitmap wont_exit;
1477 unsigned npeel = loop->lpt_decision.times;
1478 struct niter_desc *desc = get_simple_loop_desc (loop);
1479 struct opt_info *opt_info = NULL;
1480 bool ok;
1482 if (flag_split_ivs_in_unroller && npeel > 1)
1483 opt_info = analyze_insns_in_loop (loop);
1485 wont_exit = sbitmap_alloc (npeel + 1);
1486 bitmap_clear (wont_exit);
1488 opt_info_start_duplication (opt_info);
1490 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1491 npeel, wont_exit, NULL,
1492 NULL, DLTHE_FLAG_UPDATE_FREQ
1493 | (opt_info
1494 ? DLTHE_RECORD_COPY_NUMBER
1495 : 0));
1496 gcc_assert (ok);
1498 free (wont_exit);
1500 if (opt_info)
1502 apply_opt_in_copies (opt_info, npeel, false, false);
1503 free_opt_info (opt_info);
1506 if (desc->simple_p)
1508 if (desc->const_iter)
1510 desc->niter -= npeel;
1511 desc->niter_expr = GEN_INT (desc->niter);
1512 desc->noloop_assumptions = NULL_RTX;
1514 else
1516 /* We cannot just update niter_expr, as its value might be clobbered
1517 inside loop. We could handle this by counting the number into
1518 temporary just like we do in runtime unrolling, but it does not
1519 seem worthwhile. */
1520 free_simple_loop_desc (loop);
1523 if (dump_file)
1524 fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1527 /* Decide whether to unroll LOOP stupidly and how much. */
1528 static void
1529 decide_unroll_stupid (struct loop *loop, int flags)
1531 unsigned nunroll, nunroll_by_av, i;
1532 struct niter_desc *desc;
1533 widest_int iterations;
1535 if (!(flags & UAP_UNROLL_ALL))
1537 /* We were not asked to, just return back silently. */
1538 return;
1541 if (dump_file)
1542 fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1544 /* nunroll = total number of copies of the original loop body in
1545 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1546 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1547 nunroll_by_av
1548 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1549 if (nunroll > nunroll_by_av)
1550 nunroll = nunroll_by_av;
1551 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1552 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1554 if (targetm.loop_unroll_adjust)
1555 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1557 /* Skip big loops. */
1558 if (nunroll <= 1)
1560 if (dump_file)
1561 fprintf (dump_file, ";; Not considering loop, is too big\n");
1562 return;
1565 /* Check for simple loops. */
1566 desc = get_simple_loop_desc (loop);
1568 /* Check simpleness. */
1569 if (desc->simple_p && !desc->assumptions)
1571 if (dump_file)
1572 fprintf (dump_file, ";; The loop is simple\n");
1573 return;
1576 /* Do not unroll loops with branches inside -- it increases number
1577 of mispredicts.
1578 TODO: this heuristic needs tunning; call inside the loop body
1579 is also relatively good reason to not unroll. */
1580 if (num_loop_branches (loop) > 1)
1582 if (dump_file)
1583 fprintf (dump_file, ";; Not unrolling, contains branches\n");
1584 return;
1587 /* Check whether the loop rolls. */
1588 if ((get_estimated_loop_iterations (loop, &iterations)
1589 || get_max_loop_iterations (loop, &iterations))
1590 && wi::ltu_p (iterations, 2 * nunroll))
1592 if (dump_file)
1593 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1594 return;
1597 /* Success. Now force nunroll to be power of 2, as it seems that this
1598 improves results (partially because of better alignments, partially
1599 because of some dark magic). */
1600 for (i = 1; 2 * i <= nunroll; i *= 2)
1601 continue;
1603 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1604 loop->lpt_decision.times = i - 1;
1607 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1609 while (cond)
1610 body;
1612 ==> (LOOP->LPT_DECISION.TIMES == 3)
1614 while (cond)
1616 body;
1617 if (!cond) break;
1618 body;
1619 if (!cond) break;
1620 body;
1621 if (!cond) break;
1622 body;
1625 static void
1626 unroll_loop_stupid (struct loop *loop)
1628 sbitmap wont_exit;
1629 unsigned nunroll = loop->lpt_decision.times;
1630 struct niter_desc *desc = get_simple_loop_desc (loop);
1631 struct opt_info *opt_info = NULL;
1632 bool ok;
1634 if (flag_split_ivs_in_unroller
1635 || flag_variable_expansion_in_unroller)
1636 opt_info = analyze_insns_in_loop (loop);
1639 wont_exit = sbitmap_alloc (nunroll + 1);
1640 bitmap_clear (wont_exit);
1641 opt_info_start_duplication (opt_info);
1643 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1644 nunroll, wont_exit,
1645 NULL, NULL,
1646 DLTHE_FLAG_UPDATE_FREQ
1647 | (opt_info
1648 ? DLTHE_RECORD_COPY_NUMBER
1649 : 0));
1650 gcc_assert (ok);
1652 if (opt_info)
1654 apply_opt_in_copies (opt_info, nunroll, true, true);
1655 free_opt_info (opt_info);
1658 free (wont_exit);
1660 if (desc->simple_p)
1662 /* We indeed may get here provided that there are nontrivial assumptions
1663 for a loop to be really simple. We could update the counts, but the
1664 problem is that we are unable to decide which exit will be taken
1665 (not really true in case the number of iterations is constant,
1666 but no one will do anything with this information, so we do not
1667 worry about it). */
1668 desc->simple_p = false;
1671 if (dump_file)
1672 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1673 nunroll, num_loop_insns (loop));
1676 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1677 Set *DEBUG_USES to the number of debug insns that reference the
1678 variable. */
1680 bool
1681 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1682 int *debug_uses)
1684 basic_block *body, bb;
1685 unsigned i;
1686 int count_ref = 0;
1687 rtx insn;
1689 body = get_loop_body (loop);
1690 for (i = 0; i < loop->num_nodes; i++)
1692 bb = body[i];
1694 FOR_BB_INSNS (bb, insn)
1695 if (!rtx_referenced_p (reg, insn))
1696 continue;
1697 else if (DEBUG_INSN_P (insn))
1698 ++*debug_uses;
1699 else if (++count_ref > 1)
1700 break;
1702 free (body);
1703 return (count_ref == 1);
1706 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1708 static void
1709 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1711 basic_block *body, bb;
1712 unsigned i;
1713 rtx insn;
1715 body = get_loop_body (loop);
1716 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1718 bb = body[i];
1720 FOR_BB_INSNS (bb, insn)
1721 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1722 continue;
1723 else
1725 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1726 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1727 if (!--debug_uses)
1728 break;
1731 free (body);
1734 /* Determine whether INSN contains an accumulator
1735 which can be expanded into separate copies,
1736 one for each copy of the LOOP body.
1738 for (i = 0 ; i < n; i++)
1739 sum += a[i];
1743 sum += a[i]
1744 ....
1745 i = i+1;
1746 sum1 += a[i]
1747 ....
1748 i = i+1
1749 sum2 += a[i];
1750 ....
1752 Return NULL if INSN contains no opportunity for expansion of accumulator.
1753 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1754 information and return a pointer to it.
1757 static struct var_to_expand *
1758 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1760 rtx set, dest, src;
1761 struct var_to_expand *ves;
1762 unsigned accum_pos;
1763 enum rtx_code code;
1764 int debug_uses = 0;
1766 set = single_set (insn);
1767 if (!set)
1768 return NULL;
1770 dest = SET_DEST (set);
1771 src = SET_SRC (set);
1772 code = GET_CODE (src);
1774 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1775 return NULL;
1777 if (FLOAT_MODE_P (GET_MODE (dest)))
1779 if (!flag_associative_math)
1780 return NULL;
1781 /* In the case of FMA, we're also changing the rounding. */
1782 if (code == FMA && !flag_unsafe_math_optimizations)
1783 return NULL;
1786 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1787 in MD. But if there is no optab to generate the insn, we can not
1788 perform the variable expansion. This can happen if an MD provides
1789 an insn but not a named pattern to generate it, for example to avoid
1790 producing code that needs additional mode switches like for x87/mmx.
1792 So we check have_insn_for which looks for an optab for the operation
1793 in SRC. If it doesn't exist, we can't perform the expansion even
1794 though INSN is valid. */
1795 if (!have_insn_for (code, GET_MODE (src)))
1796 return NULL;
1798 if (!REG_P (dest)
1799 && !(GET_CODE (dest) == SUBREG
1800 && REG_P (SUBREG_REG (dest))))
1801 return NULL;
1803 /* Find the accumulator use within the operation. */
1804 if (code == FMA)
1806 /* We only support accumulation via FMA in the ADD position. */
1807 if (!rtx_equal_p (dest, XEXP (src, 2)))
1808 return NULL;
1809 accum_pos = 2;
1811 else if (rtx_equal_p (dest, XEXP (src, 0)))
1812 accum_pos = 0;
1813 else if (rtx_equal_p (dest, XEXP (src, 1)))
1815 /* The method of expansion that we are using; which includes the
1816 initialization of the expansions with zero and the summation of
1817 the expansions at the end of the computation will yield wrong
1818 results for (x = something - x) thus avoid using it in that case. */
1819 if (code == MINUS)
1820 return NULL;
1821 accum_pos = 1;
1823 else
1824 return NULL;
1826 /* It must not otherwise be used. */
1827 if (code == FMA)
1829 if (rtx_referenced_p (dest, XEXP (src, 0))
1830 || rtx_referenced_p (dest, XEXP (src, 1)))
1831 return NULL;
1833 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1834 return NULL;
1836 /* It must be used in exactly one insn. */
1837 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1838 return NULL;
1840 if (dump_file)
1842 fprintf (dump_file, "\n;; Expanding Accumulator ");
1843 print_rtl (dump_file, dest);
1844 fprintf (dump_file, "\n");
1847 if (debug_uses)
1848 /* Instead of resetting the debug insns, we could replace each
1849 debug use in the loop with the sum or product of all expanded
1850 accummulators. Since we'll only know of all expansions at the
1851 end, we'd have to keep track of which vars_to_expand a debug
1852 insn in the loop references, take note of each copy of the
1853 debug insn during unrolling, and when it's all done, compute
1854 the sum or product of each variable and adjust the original
1855 debug insn and each copy thereof. What a pain! */
1856 reset_debug_uses_in_loop (loop, dest, debug_uses);
1858 /* Record the accumulator to expand. */
1859 ves = XNEW (struct var_to_expand);
1860 ves->insn = insn;
1861 ves->reg = copy_rtx (dest);
1862 ves->var_expansions.create (1);
1863 ves->next = NULL;
1864 ves->op = GET_CODE (src);
1865 ves->expansion_count = 0;
1866 ves->reuse_expansion = 0;
1867 return ves;
1870 /* Determine whether there is an induction variable in INSN that
1871 we would like to split during unrolling.
1873 I.e. replace
1875 i = i + 1;
1877 i = i + 1;
1879 i = i + 1;
1882 type chains by
1884 i0 = i + 1
1886 i = i0 + 1
1888 i = i0 + 2
1891 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1892 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1893 pointer to it. */
1895 static struct iv_to_split *
1896 analyze_iv_to_split_insn (rtx insn)
1898 rtx set, dest;
1899 struct rtx_iv iv;
1900 struct iv_to_split *ivts;
1901 bool ok;
1903 /* For now we just split the basic induction variables. Later this may be
1904 extended for example by selecting also addresses of memory references. */
1905 set = single_set (insn);
1906 if (!set)
1907 return NULL;
1909 dest = SET_DEST (set);
1910 if (!REG_P (dest))
1911 return NULL;
1913 if (!biv_p (insn, dest))
1914 return NULL;
1916 ok = iv_analyze_result (insn, dest, &iv);
1918 /* This used to be an assert under the assumption that if biv_p returns
1919 true that iv_analyze_result must also return true. However, that
1920 assumption is not strictly correct as evidenced by pr25569.
1922 Returning NULL when iv_analyze_result returns false is safe and
1923 avoids the problems in pr25569 until the iv_analyze_* routines
1924 can be fixed, which is apparently hard and time consuming
1925 according to their author. */
1926 if (! ok)
1927 return NULL;
1929 if (iv.step == const0_rtx
1930 || iv.mode != iv.extend_mode)
1931 return NULL;
1933 /* Record the insn to split. */
1934 ivts = XNEW (struct iv_to_split);
1935 ivts->insn = insn;
1936 ivts->orig_var = dest;
1937 ivts->base_var = NULL_RTX;
1938 ivts->step = iv.step;
1939 ivts->next = NULL;
1941 return ivts;
1944 /* Determines which of insns in LOOP can be optimized.
1945 Return a OPT_INFO struct with the relevant hash tables filled
1946 with all insns to be optimized. The FIRST_NEW_BLOCK field
1947 is undefined for the return value. */
1949 static struct opt_info *
1950 analyze_insns_in_loop (struct loop *loop)
1952 basic_block *body, bb;
1953 unsigned i;
1954 struct opt_info *opt_info = XCNEW (struct opt_info);
1955 rtx insn;
1956 struct iv_to_split *ivts = NULL;
1957 struct var_to_expand *ves = NULL;
1958 iv_to_split **slot1;
1959 var_to_expand **slot2;
1960 vec<edge> edges = get_loop_exit_edges (loop);
1961 edge exit;
1962 bool can_apply = false;
1964 iv_analysis_loop_init (loop);
1966 body = get_loop_body (loop);
1968 if (flag_split_ivs_in_unroller)
1970 opt_info->insns_to_split
1971 = new hash_table<iv_split_hasher> (5 * loop->num_nodes);
1972 opt_info->iv_to_split_head = NULL;
1973 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1976 /* Record the loop exit bb and loop preheader before the unrolling. */
1977 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1979 if (edges.length () == 1)
1981 exit = edges[0];
1982 if (!(exit->flags & EDGE_COMPLEX))
1984 opt_info->loop_exit = split_edge (exit);
1985 can_apply = true;
1989 if (flag_variable_expansion_in_unroller
1990 && can_apply)
1992 opt_info->insns_with_var_to_expand
1993 = new hash_table<var_expand_hasher> (5 * loop->num_nodes);
1994 opt_info->var_to_expand_head = NULL;
1995 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1998 for (i = 0; i < loop->num_nodes; i++)
2000 bb = body[i];
2001 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
2002 continue;
2004 FOR_BB_INSNS (bb, insn)
2006 if (!INSN_P (insn))
2007 continue;
2009 if (opt_info->insns_to_split)
2010 ivts = analyze_iv_to_split_insn (insn);
2012 if (ivts)
2014 slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
2015 gcc_assert (*slot1 == NULL);
2016 *slot1 = ivts;
2017 *opt_info->iv_to_split_tail = ivts;
2018 opt_info->iv_to_split_tail = &ivts->next;
2019 continue;
2022 if (opt_info->insns_with_var_to_expand)
2023 ves = analyze_insn_to_expand_var (loop, insn);
2025 if (ves)
2027 slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
2028 gcc_assert (*slot2 == NULL);
2029 *slot2 = ves;
2030 *opt_info->var_to_expand_tail = ves;
2031 opt_info->var_to_expand_tail = &ves->next;
2036 edges.release ();
2037 free (body);
2038 return opt_info;
2041 /* Called just before loop duplication. Records start of duplicated area
2042 to OPT_INFO. */
2044 static void
2045 opt_info_start_duplication (struct opt_info *opt_info)
2047 if (opt_info)
2048 opt_info->first_new_block = last_basic_block_for_fn (cfun);
2051 /* Determine the number of iterations between initialization of the base
2052 variable and the current copy (N_COPY). N_COPIES is the total number
2053 of newly created copies. UNROLLING is true if we are unrolling
2054 (not peeling) the loop. */
2056 static unsigned
2057 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
2059 if (unrolling)
2061 /* If we are unrolling, initialization is done in the original loop
2062 body (number 0). */
2063 return n_copy;
2065 else
2067 /* If we are peeling, the copy in that the initialization occurs has
2068 number 1. The original loop (number 0) is the last. */
2069 if (n_copy)
2070 return n_copy - 1;
2071 else
2072 return n_copies;
2076 /* Allocate basic variable for the induction variable chain. */
2078 static void
2079 allocate_basic_variable (struct iv_to_split *ivts)
2081 rtx expr = SET_SRC (single_set (ivts->insn));
2083 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2086 /* Insert initialization of basic variable of IVTS before INSN, taking
2087 the initial value from INSN. */
2089 static void
2090 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2092 rtx expr = copy_rtx (SET_SRC (single_set (insn)));
2093 rtx seq;
2095 start_sequence ();
2096 expr = force_operand (expr, ivts->base_var);
2097 if (expr != ivts->base_var)
2098 emit_move_insn (ivts->base_var, expr);
2099 seq = get_insns ();
2100 end_sequence ();
2102 emit_insn_before (seq, insn);
2105 /* Replace the use of induction variable described in IVTS in INSN
2106 by base variable + DELTA * step. */
2108 static void
2109 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2111 rtx expr, *loc, seq, incr, var;
2112 enum machine_mode mode = GET_MODE (ivts->base_var);
2113 rtx src, dest, set;
2115 /* Construct base + DELTA * step. */
2116 if (!delta)
2117 expr = ivts->base_var;
2118 else
2120 incr = simplify_gen_binary (MULT, mode,
2121 ivts->step, gen_int_mode (delta, mode));
2122 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2123 ivts->base_var, incr);
2126 /* Figure out where to do the replacement. */
2127 loc = &SET_SRC (single_set (insn));
2129 /* If we can make the replacement right away, we're done. */
2130 if (validate_change (insn, loc, expr, 0))
2131 return;
2133 /* Otherwise, force EXPR into a register and try again. */
2134 start_sequence ();
2135 var = gen_reg_rtx (mode);
2136 expr = force_operand (expr, var);
2137 if (expr != var)
2138 emit_move_insn (var, expr);
2139 seq = get_insns ();
2140 end_sequence ();
2141 emit_insn_before (seq, insn);
2143 if (validate_change (insn, loc, var, 0))
2144 return;
2146 /* The last chance. Try recreating the assignment in insn
2147 completely from scratch. */
2148 set = single_set (insn);
2149 gcc_assert (set);
2151 start_sequence ();
2152 *loc = var;
2153 src = copy_rtx (SET_SRC (set));
2154 dest = copy_rtx (SET_DEST (set));
2155 src = force_operand (src, dest);
2156 if (src != dest)
2157 emit_move_insn (dest, src);
2158 seq = get_insns ();
2159 end_sequence ();
2161 emit_insn_before (seq, insn);
2162 delete_insn (insn);
2166 /* Return one expansion of the accumulator recorded in struct VE. */
2168 static rtx
2169 get_expansion (struct var_to_expand *ve)
2171 rtx reg;
2173 if (ve->reuse_expansion == 0)
2174 reg = ve->reg;
2175 else
2176 reg = ve->var_expansions[ve->reuse_expansion - 1];
2178 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2179 ve->reuse_expansion = 0;
2180 else
2181 ve->reuse_expansion++;
2183 return reg;
2187 /* Given INSN replace the uses of the accumulator recorded in VE
2188 with a new register. */
2190 static void
2191 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2193 rtx new_reg, set;
2194 bool really_new_expansion = false;
2196 set = single_set (insn);
2197 gcc_assert (set);
2199 /* Generate a new register only if the expansion limit has not been
2200 reached. Else reuse an already existing expansion. */
2201 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2203 really_new_expansion = true;
2204 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2206 else
2207 new_reg = get_expansion (ve);
2209 validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2210 if (apply_change_group ())
2211 if (really_new_expansion)
2213 ve->var_expansions.safe_push (new_reg);
2214 ve->expansion_count++;
2218 /* Initialize the variable expansions in loop preheader. PLACE is the
2219 loop-preheader basic block where the initialization of the
2220 expansions should take place. The expansions are initialized with
2221 (-0) when the operation is plus or minus to honor sign zero. This
2222 way we can prevent cases where the sign of the final result is
2223 effected by the sign of the expansion. Here is an example to
2224 demonstrate this:
2226 for (i = 0 ; i < n; i++)
2227 sum += something;
2231 sum += something
2232 ....
2233 i = i+1;
2234 sum1 += something
2235 ....
2236 i = i+1
2237 sum2 += something;
2238 ....
2240 When SUM is initialized with -zero and SOMETHING is also -zero; the
2241 final result of sum should be -zero thus the expansions sum1 and sum2
2242 should be initialized with -zero as well (otherwise we will get +zero
2243 as the final result). */
2245 static void
2246 insert_var_expansion_initialization (struct var_to_expand *ve,
2247 basic_block place)
2249 rtx seq, var, zero_init;
2250 unsigned i;
2251 enum machine_mode mode = GET_MODE (ve->reg);
2252 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2254 if (ve->var_expansions.length () == 0)
2255 return;
2257 start_sequence ();
2258 switch (ve->op)
2260 case FMA:
2261 /* Note that we only accumulate FMA via the ADD operand. */
2262 case PLUS:
2263 case MINUS:
2264 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2266 if (honor_signed_zero_p)
2267 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2268 else
2269 zero_init = CONST0_RTX (mode);
2270 emit_move_insn (var, zero_init);
2272 break;
2274 case MULT:
2275 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2277 zero_init = CONST1_RTX (GET_MODE (var));
2278 emit_move_insn (var, zero_init);
2280 break;
2282 default:
2283 gcc_unreachable ();
2286 seq = get_insns ();
2287 end_sequence ();
2289 emit_insn_after (seq, BB_END (place));
2292 /* Combine the variable expansions at the loop exit. PLACE is the
2293 loop exit basic block where the summation of the expansions should
2294 take place. */
2296 static void
2297 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2299 rtx sum = ve->reg;
2300 rtx expr, seq, var, insn;
2301 unsigned i;
2303 if (ve->var_expansions.length () == 0)
2304 return;
2306 start_sequence ();
2307 switch (ve->op)
2309 case FMA:
2310 /* Note that we only accumulate FMA via the ADD operand. */
2311 case PLUS:
2312 case MINUS:
2313 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2314 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2315 break;
2317 case MULT:
2318 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2319 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2320 break;
2322 default:
2323 gcc_unreachable ();
2326 expr = force_operand (sum, ve->reg);
2327 if (expr != ve->reg)
2328 emit_move_insn (ve->reg, expr);
2329 seq = get_insns ();
2330 end_sequence ();
2332 insn = BB_HEAD (place);
2333 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2334 insn = NEXT_INSN (insn);
2336 emit_insn_after (seq, insn);
2339 /* Strip away REG_EQUAL notes for IVs we're splitting.
2341 Updating REG_EQUAL notes for IVs we split is tricky: We
2342 cannot tell until after unrolling, DF-rescanning, and liveness
2343 updating, whether an EQ_USE is reached by the split IV while
2344 the IV reg is still live. See PR55006.
2346 ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2347 because RTL loop-iv requires us to defer rescanning insns and
2348 any notes attached to them. So resort to old techniques... */
2350 static void
2351 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2353 struct iv_to_split *ivts;
2354 rtx note = find_reg_equal_equiv_note (insn);
2355 if (! note)
2356 return;
2357 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2358 if (reg_mentioned_p (ivts->orig_var, note))
2360 remove_note (insn, note);
2361 return;
2365 /* Apply loop optimizations in loop copies using the
2366 data which gathered during the unrolling. Structure
2367 OPT_INFO record that data.
2369 UNROLLING is true if we unrolled (not peeled) the loop.
2370 REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2371 the loop (as it should happen in complete unrolling, but not in ordinary
2372 peeling of the loop). */
2374 static void
2375 apply_opt_in_copies (struct opt_info *opt_info,
2376 unsigned n_copies, bool unrolling,
2377 bool rewrite_original_loop)
2379 unsigned i, delta;
2380 basic_block bb, orig_bb;
2381 rtx insn, orig_insn, next;
2382 struct iv_to_split ivts_templ, *ivts;
2383 struct var_to_expand ve_templ, *ves;
2385 /* Sanity check -- we need to put initialization in the original loop
2386 body. */
2387 gcc_assert (!unrolling || rewrite_original_loop);
2389 /* Allocate the basic variables (i0). */
2390 if (opt_info->insns_to_split)
2391 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2392 allocate_basic_variable (ivts);
2394 for (i = opt_info->first_new_block;
2395 i < (unsigned) last_basic_block_for_fn (cfun);
2396 i++)
2398 bb = BASIC_BLOCK_FOR_FN (cfun, i);
2399 orig_bb = get_bb_original (bb);
2401 /* bb->aux holds position in copy sequence initialized by
2402 duplicate_loop_to_header_edge. */
2403 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2404 unrolling);
2405 bb->aux = 0;
2406 orig_insn = BB_HEAD (orig_bb);
2407 FOR_BB_INSNS_SAFE (bb, insn, next)
2409 if (!INSN_P (insn)
2410 || (DEBUG_INSN_P (insn)
2411 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2412 continue;
2414 while (!INSN_P (orig_insn)
2415 || (DEBUG_INSN_P (orig_insn)
2416 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2417 == LABEL_DECL)))
2418 orig_insn = NEXT_INSN (orig_insn);
2420 ivts_templ.insn = orig_insn;
2421 ve_templ.insn = orig_insn;
2423 /* Apply splitting iv optimization. */
2424 if (opt_info->insns_to_split)
2426 maybe_strip_eq_note_for_split_iv (opt_info, insn);
2428 ivts = opt_info->insns_to_split->find (&ivts_templ);
2430 if (ivts)
2432 gcc_assert (GET_CODE (PATTERN (insn))
2433 == GET_CODE (PATTERN (orig_insn)));
2435 if (!delta)
2436 insert_base_initialization (ivts, insn);
2437 split_iv (ivts, insn, delta);
2440 /* Apply variable expansion optimization. */
2441 if (unrolling && opt_info->insns_with_var_to_expand)
2443 ves = (struct var_to_expand *)
2444 opt_info->insns_with_var_to_expand->find (&ve_templ);
2445 if (ves)
2447 gcc_assert (GET_CODE (PATTERN (insn))
2448 == GET_CODE (PATTERN (orig_insn)));
2449 expand_var_during_unrolling (ves, insn);
2452 orig_insn = NEXT_INSN (orig_insn);
2456 if (!rewrite_original_loop)
2457 return;
2459 /* Initialize the variable expansions in the loop preheader
2460 and take care of combining them at the loop exit. */
2461 if (opt_info->insns_with_var_to_expand)
2463 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2464 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2465 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2466 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2469 /* Rewrite also the original loop body. Find them as originals of the blocks
2470 in the last copied iteration, i.e. those that have
2471 get_bb_copy (get_bb_original (bb)) == bb. */
2472 for (i = opt_info->first_new_block;
2473 i < (unsigned) last_basic_block_for_fn (cfun);
2474 i++)
2476 bb = BASIC_BLOCK_FOR_FN (cfun, i);
2477 orig_bb = get_bb_original (bb);
2478 if (get_bb_copy (orig_bb) != bb)
2479 continue;
2481 delta = determine_split_iv_delta (0, n_copies, unrolling);
2482 for (orig_insn = BB_HEAD (orig_bb);
2483 orig_insn != NEXT_INSN (BB_END (bb));
2484 orig_insn = next)
2486 next = NEXT_INSN (orig_insn);
2488 if (!INSN_P (orig_insn))
2489 continue;
2491 ivts_templ.insn = orig_insn;
2492 if (opt_info->insns_to_split)
2494 maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2496 ivts = (struct iv_to_split *)
2497 opt_info->insns_to_split->find (&ivts_templ);
2498 if (ivts)
2500 if (!delta)
2501 insert_base_initialization (ivts, orig_insn);
2502 split_iv (ivts, orig_insn, delta);
2503 continue;
2511 /* Release OPT_INFO. */
2513 static void
2514 free_opt_info (struct opt_info *opt_info)
2516 delete opt_info->insns_to_split;
2517 opt_info->insns_to_split = NULL;
2518 if (opt_info->insns_with_var_to_expand)
2520 struct var_to_expand *ves;
2522 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2523 ves->var_expansions.release ();
2524 delete opt_info->insns_with_var_to_expand;
2525 opt_info->insns_with_var_to_expand = NULL;
2527 free (opt_info);