/* Loop unrolling.
   Copyright (C) 2002-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "memmodel.h"
#include "optabs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "profile.h"
#include "cfgrtl.h"
#include "cfgloop.h"
#include "params.h"
#include "dojump.h"
#include "expr.h"
#include "dumpfile.h"

/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain is caused by greater sequentiality
   of code, better code to optimize for further passes, and in some cases
   by fewer tests of exit conditions.  The main problem is code growth,
   which impacts performance negatively due to its effect on caches.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is almost
      always a win, as we get rid of exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see comments at
   the appropriate function below.

   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of
   several percent.  */

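/* The pass is requested through the UAP_UNROLL / UAP_UNROLL_ALL bits in
   the FLAGS arguments below (wired to -funroll-loops and
   -funroll-all-loops in loop-init.c) or per loop through loop->unroll,
   which a user-supplied unrolling pragma fills in; the parameters
   mentioned above (max-unrolled-insns, max-average-unrolled-insns,
   max-unroll-times) bound the result.  */
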
/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;	/* The insn in which the induction variable occurs.  */
  rtx orig_var;		/* The variable (register) for the IV before split.  */
  rtx base_var;		/* The variable on which the values in the further
			   iterations are based.  */
  rtx step;		/* Step of the induction variable.  */
  struct iv_to_split *next;	/* Next entry in walking order.  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;	/* The insn in which the variable expansion occurs.  */
  rtx reg;		/* The accumulator which is expanded.  */
  vec<rtx> var_expansions;	/* The copies of the accumulator which
				   is expanded.  */
  struct var_to_expand *next;	/* Next entry in walking order.  */
  enum rtx_code op;	/* The type of the accumulation - addition, subtraction
			   or multiplication.  */
  int expansion_count;	/* Count the number of expansions generated so far.  */
  int reuse_expansion;	/* The expansion we intend to reuse to expand
			   the accumulator.  If REUSE_EXPANSION is 0 reuse
			   the original accumulator.  Else use
			   var_expansions[REUSE_EXPANSION - 1].  */
};

/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};

/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const iv_to_split *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}

/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
{
  return i1->insn == i2->insn;
}

/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};

/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const var_to_expand *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}

/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
{
  return i1->insn == i2->insn;
}

/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split;	/* A hashtable of insns to
						   split.  */
  struct iv_to_split *iv_to_split_head;	/* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail;	/* Pointer to the tail of the
						   list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand;	/* A hashtable
					of insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head;	/* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail;	/* Pointer to the tail of the
						   list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};

static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void unroll_loop_stupid (struct loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);

/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (struct loop *loop, dump_location_t locus)
{
  dump_flags_t report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_metadata_t metadata (report_flags, locus.get_impl_location ());
  dump_printf_loc (metadata, locus.get_user_location (),
		   "loop unrolled %d times",
		   loop->lpt_decision.times);
  if (profile_info && loop->header->count.initialized_p ())
    dump_printf (metadata,
		 " (header execution count %d)",
		 (int)loop->header->count.to_gcov_type ());

  dump_printf (metadata, "\n");
}

/* Decide whether to unroll loops and how much.  */
static void
decide_unrolling (int flags)
{
  struct loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      dump_user_location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, locus,
			 "considering unrolling loop %d at BB %d\n",
			 loop->num, loop->header->index);

      if (loop->unroll == 1)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not unrolling loop, user didn't want it unrolled\n");
	  continue;
	}

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}

/* Unroll LOOPS.  */
void
unroll_loops (int flags)
{
  struct loop *loop;
  bool changed = false;

  /* Now decide the rest of the unrolling.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }

  iv_analysis_done ();
}

/* Check whether the exit of the LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn) && active_insn_p (insn))
	return false;
    }

  return true;
}

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with constant "
		 "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
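
  /* Illustration, assuming the documented parameter defaults of this era
     (max-unrolled-insns = 200, max-average-unrolled-insns = 80,
     max-unroll-times = 8): a loop of 20 insns averaging 10 executed insns
     per iteration gets nunroll = MIN (200/20, 80/10, 8) = 8 before the
     target hook below gets a chance to adjust it.  */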

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check for an explicit unrolling factor.  */
  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    {
      /* However we cannot unroll completely at the RTL level a loop with
	 a constant number of iterations; it should have been peeled instead.  */
      if ((unsigned) loop->unroll - 1 > desc->niter - 2)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Loop should have been peeled\n");
	}
      else
	{
	  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
	  loop->lpt_decision.times = loop->unroll - 1;
	}
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in case the loop has more
     than one exit it may well loop fewer times than the determined
     maximal number of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_likely_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
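  /* Worked example (numbers chosen for illustration): with
     desc->niter == 100, nunroll == 8 and the exit at the beginning of the
     body, the search below tries i from 18 down to 7 and scores each
     candidate as n_copies = 100 % (i + 1) + i + 1.  The minimum is reached
     at i == 9 (exit_mod == 0, n_copies == 10): the body is duplicated nine
     times, nothing needs peeling, and the unrolled loop rolls exactly ten
     times.  */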
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i > desc->niter - 2)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}

/* Unroll LOOP with a constant number of iterations LOOP->LPT_DECISION.TIMES
   times.  The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);
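
  /* In the example from the comment above, niter == 102 and
     max_unroll == 3 give exit_mod == 102 % 4 == 2, i.e. two peeled
     iterations in front of a loop whose body is duplicated four times.  */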

  auto_sbitmap wont_exit (max_unroll + 2);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that the loops that start with a test
	 of the exit condition have a continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod;
	  else
	    loop->any_likely_upper_bound = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as above if
	 the loop tests the condition at the end of the loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care about
	 the case when we would exit before reaching the loop.  So just peel
	 exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
	  else
	    loop->any_likely_upper_bound = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);

      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP iterating a runtime-computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with runtime-"
		 "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as the code generation
     requires it; we would otherwise be unable to cope with overflows in
     the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;
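
  /* E.g. nunroll == 6 leaves i == 4 after the loop above, so we decide on
     times == 3, i.e. four copies of the body.  */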
  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}

/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do a comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by the # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, profile_probability prob,
		      rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label,
			       profile_probability::uninitialized ());
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  if (prob.initialized_p ())
    add_reg_br_prob_note (jump, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}

/* Unroll LOOP, for which we are able to count the number of iterations at
   runtime, LOOP->LPT_DECISION.TIMES times.  The times value must be a
   power of two.  The transformation does this (with some extra care
   for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i, j;
  profile_probability p;
  basic_block preheader, *body, swtch, ezc_swtch = NULL;
  int may_exit_copy;
  profile_count iter_count, new_count;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave the exit in the first copy (for an explanation why, see the
	 comment in unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave the exit in the last copy (for an explanation why, see the
	 comment in unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get an expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* For loops that exit at the end and whose number of iterations is
     reliable, add one to niter to account for the first pass through the
     loop body before reaching the exit test.  */
  if (exit_at_end && !desc->noloop_assumptions)
    {
      niter = expand_simple_binop (desc->mode, PLUS,
				   niter, const1_rtx,
				   NULL_RTX, 0, OPTAB_LIB_WIDEN);
      old_niter = niter;
    }

  /* Count the modulus by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
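
  /* E.g. for max_unroll == 3 (four copies of the body) the modulus is
     niter & 3; with niter == 11 this yields 3, the same as 11 % 4, and
     the masking stays correct even if the increment above wrapped.  */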

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  auto_sbitmap wont_exit (max_unroll + 2);

  if (extra_zero_check || desc->noloop_assumptions)
    {
      /* Peel the first copy of the loop body.  Leave the exit test if the
	 number of iterations is not reliable.  Also record the place of the
	 extra zero check.  */
      bitmap_clear (wont_exit);
      if (!desc->noloop_assumptions)
	bitmap_set_bit (wont_exit, 1);
      ezc_swtch = loop_preheader_edge (loop)->src;
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);
    }

  /* Record the place where the switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  /* Compute count increments for each switch block and initialize
     the innermost switch block.  Switch blocks and peeled loop copies are
     built from the innermost outward.  */
  iter_count = new_count = swtch->count.apply_scale (1, max_unroll + 1);
  swtch->count = new_count;

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create an item for the switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = profile_probability::always ().apply_scale (1, i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      /* Add in the count of the edge from the switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
					  block_label (preheader), p,
					  NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      new_count += iter_count;
      swtch->count = new_count;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add a branch for zero iterations.  */
      p = profile_probability::always ().apply_scale (1, max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      /* Recompute count adjustments since the initial peel copy may
	 have exited and reduced those values that were computed above.  */
      iter_count = swtch->count.apply_scale (1, max_unroll + 1);
      /* Add in the count of the edge from the switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll the loop.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);

      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
      if (loop->any_likely_upper_bound
	  && loop->nb_iterations_likely_upper_bound != 0)
	--loop->nb_iterations_likely_upper_bound;
      else
	loop->any_likely_upper_bound = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL_ALL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE, "considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}

/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  auto_sbitmap wont_exit (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
	 for a loop to be really simple.  We could update the counts, but the
	 problem is that we are unable to decide which exit will be taken
	 (not really true in case the number of iterations is constant,
	 but no one will do anything with this information, so we do not
	 worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}

/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

static bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  ++*debug_uses;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}

/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it with
   the relevant information and return a pointer to it.
*/

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using, which includes the
	 initialization of the expansions with zero and the summation of
	 the expansions at the end of the computation, will yield wrong
	 results for (x = something - x); thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;

   i = i + 1;

   i = i + 1;

   type chains by

   i0 = i + 1

   i = i0 + 1

   i = i0 + 2

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  scalar_int_mode mode;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest) || !is_a <scalar_int_mode> (GET_MODE (dest), &mode))
    return NULL;

  if (!biv_p (insn, mode, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}

/* Determine which insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}

/* Called just before loop duplication.  Records the start of the duplicated
   area to OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
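
/* For instance, with N_COPIES == 3 when peeling, copies 1 through 3 run
   before the original body and map to deltas 0 through 2 (copy 1 holds
   the initialization), while the original copy 0, executed last, gets
   delta 3.  When unrolling, copy 0 initializes and the delta is simply
   N_COPY.  */
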
/* Allocate the basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert the initialization of the basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}

/* Replace the use of the induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  copy_rtx (ivts->step),
				  gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }
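
  /* E.g. for the chain shown before analyze_iv_to_split_insn (step == 1),
     the copy with DELTA == 2 has its use of i rewritten into i0 + 2.  */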

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}

/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}

/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
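  /* The limit is --param max-variable-expansions-in-unroller
     (PARAM_MAX_VARIABLE_EXPANSIONS); assuming its historical default of 1,
     each accumulator gets one extra copy, and later insns cycle through
     the existing expansions via get_expansion.  */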
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}

/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  /* ve->reg might be a SUBREG or some other non-shareable RTL, and we use
     it both here and as the destination of the assignment.  */
  sum = copy_rtx (sum);
  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}

/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}

/* Apply loop optimizations in loop copies using the
   data which was gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds the position in the copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_BIND_INSN_P (insn)
		  && INSN_VAR_LOCATION_DECL (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_BIND_INSN_P (orig_insn)
		     && INSN_VAR_LOCATION_DECL (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find the blocks as originals of
     the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}