/* Loop unrolling.
   Copyright (C) 2002-2016 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "optabs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "profile.h"
#include "cfgrtl.h"
#include "cfgloop.h"
#include "params.h"
#include "dojump.h"
#include "expr.h"
#include "dumpfile.h"
/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from greater sequentiality
   of code, better code for further optimization passes to work with, and
   in some cases from fewer tests of exit conditions.  The main problem is
   code growth, which impacts performance negatively due to cache effects.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of the exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of the exit condition tests here, but
      there is the extra expense of calculating the number of iterations
   -- simple unrolling of the remaining loops; this is performed only if
      we are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see the comments at
   the appropriate functions below.

   There are a lot of parameters (defined and described in params.def)
   that control how much we unroll.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll a loop; the experiments I have made
   showed that this choice may affect performance by several percent.  */
/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;		/* The insn in which the induction variable occurs.  */
  rtx orig_var;			/* The variable (register) for the IV before split.  */
  rtx base_var;			/* The variable on which the values in the further
				   iterations are based.  */
  rtx step;			/* Step of the induction variable.  */
  struct iv_to_split *next;	/* Next entry in walking order.  */
};
/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;		/* The insn in which the variable expansion occurs.  */
  rtx reg;			/* The accumulator which is expanded.  */
  vec<rtx> var_expansions;	/* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;	/* Next entry in walking order.  */
  enum rtx_code op;		/* The type of the accumulation - addition, subtraction
				   or multiplication.  */
  int expansion_count;		/* Count the number of expansions generated so far.  */
  int reuse_expansion;		/* The expansion we intend to reuse to expand
				   the accumulator.  If REUSE_EXPANSION is 0 reuse
				   the original accumulator.  Else use
				   var_expansions[REUSE_EXPANSION - 1].  */
};
/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};


/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const iv_to_split *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}

/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
{
  return i1->insn == i2->insn;
}
/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};

/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const var_to_expand *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}

/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
{
  return i1->insn == i2->insn;
}
/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void unroll_loop_stupid (struct loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);
/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (struct loop *loop, location_t locus)
{
  int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_printf_loc (report_flags, locus,
                   "loop unrolled %d times",
                   loop->lpt_decision.times);
  if (profile_info)
    dump_printf (report_flags,
                 " (header execution count %d)",
                 (int)loop->header->count);

  dump_printf (report_flags, "\n");
}
/* Decide whether to unroll loops and how much.  */
static void
decide_unrolling (int flags)
{
  struct loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (TDF_RTL, locus,
			 ";; *** Considering loop %d at BB %d for "
			 "unrolling ***\n",
			 loop->num, loop->header->index);

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
	 priority.  */

      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}
/* Unroll LOOPS.  */
void
unroll_loops (int flags)
{
  struct loop *loop;
  bool changed = false;

  /* Now decide rest of unrolling.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }

  iv_analysis_done ();
}
/* Check whether exit of the LOOP is at the end of loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    if (INSN_P (insn) && active_insn_p (insn))
      return false;

  return true;
}
/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with constant "
	     "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in case the loop has more
     than one exit it may well loop fewer times than the determined
     maximal number of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}
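
/* A worked example of the copy-count search in
   decide_unroll_constant_iterations (a sketch, assuming the exit test is
   at the beginning of the loop, so n_copies == exit_mod + i + 1): with
   nunroll == 8 and desc->niter == 102, i runs from 18 down to 7.  The
   minimum is reached at i == 9, where 102 % 10 == 2 gives 2 peeled
   copies plus 10 copies inside the unrolled body, 12 in total, so
   best_unroll becomes 9.  */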
/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
  */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test
	 of the exit condition have a continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod;
	  else
	    loop->any_likely_upper_bound = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as above,
	 if the loop tests the condition at the end of the loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care about
	 the case when we would exit before reaching the loop.  So just peel
	 exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
	  else
	    loop->any_likely_upper_bound = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
/* Decide whether to unroll LOOP iterating a runtime-computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with runtime "
	     "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}
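
/* A sketch of the power-of-two rounding above: with nunroll == 7 the
   iteration goes i = 1 -> 2 -> 4 and stops, since 2 * 4 > 7.  So
   lpt_decision.times becomes 3, i.e. four copies of the body, the
   largest power of two not exceeding nunroll.  */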
/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}
/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, int prob, rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label, -1);
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  add_int_reg_note (jump, REG_BR_PROB, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}
/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
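
  /* For example (a sketch): with max_unroll == 3 the unrolled body holds
     four copies, and niter & 3 equals niter % 4 for any unsigned value of
     niter, including values where the computation of niter itself wrapped
     around, precisely because 4 is a power of two; a non-power-of-two
     factor would not allow this masking trick.  */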
  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of loop body (almost always we must leave exit test
     here; the only exception is when we have extra zero check and the number
     of iterations is reliable.  Also record the place of (possible) extra
     zero check.  */
  bitmap_clear (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    bitmap_set_bit (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
				      1, wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
					  block_label (preheader), p,
					  NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
      if (loop->any_likely_upper_bound
	  && loop->nb_iterations_likely_upper_bound != 0)
	--loop->nb_iterations_likely_upper_bound;
      else
	loop->any_likely_upper_bound = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}
/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
	 for a loop to be really simple.  We could update the counts, but the
	 problem is that we are unable to decide which exit will be taken
	 (not really true in case the number of iterations is constant,
	 but no one will do anything with this information, so we do not
	 worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}
/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

static bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  ++*debug_uses;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}
/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}
/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.
*/
static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we cannot perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
           && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using, which includes the
	 initialization of the expansions with zero and the summation of
	 the expansions at the end of the computation, will yield wrong
	 results for (x = something - x); thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}
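
/* Two illustrative cases for analyze_insn_to_expand_var (hypothetical
   RTL, a sketch; the FP case additionally assumes -fassociative-math):

     (set (reg:SF 100) (plus:SF (reg:SF 100) (reg:SF 101)))

   is accepted when reg 100 appears in no other nondebug insn in the
   loop; the accumulator sits in operand 0, so accum_pos == 0.  By
   contrast,

     (set (reg:SF 100) (minus:SF (reg:SF 101) (reg:SF 100)))

   is rejected, because x = something - x cannot be rewritten as
   independent partial results that are combined at the loop exit.  */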
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */
static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
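
/* As a sketch, a typical insn that analyze_iv_to_split_insn accepts is a
   basic induction variable step such as

     (set (reg:SI 100) (plus:SI (reg:SI 100) (const_int 4)))

   for which iv analysis yields iv.step == (const_int 4).  Insns whose
   step is zero, or whose IV is only valid in an extended mode, are
   skipped by the tests above.  */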
/* Determines which of the insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}
/* Called just before loop duplication.  Records start of duplicated area
   to OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}
/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
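
/* For example (a sketch of determine_split_iv_delta): when unrolling
   with n_copies == 3, the original body is copy 0 and holds the
   initialization, so copies 0, 1, 2, 3 get deltas 0, 1, 2, 3.  When
   peeling, the initialization lands in copy 1, so copies 1, 2, 3 get
   deltas 0, 1, 2 and the original body (copy 0) comes last with delta
   n_copies == 3.  */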
/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}
/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}
/* Replace the use of the induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
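
/* A sketch of the round-robin reuse implemented by get_expansion: with
   two recorded expansions, reuse_expansion cycles 0 -> 1 -> 2 -> 0, so
   successive calls return ve->reg, var_expansions[0], var_expansions[1]
   and then start over with ve->reg again.  */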
/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}
/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero; the
   final result of sum should be -zero thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}
/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body
   of the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_INSN_P (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_INSN_P (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find the original blocks as the
     originals of the blocks in the last copied iteration, i.e. those that
     have get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}
/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}