/* Loop unrolling.
   Copyright (C) 2002-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "memmodel.h"
#include "optabs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "profile.h"
#include "cfgrtl.h"
#include "cfgloop.h"
#include "dojump.h"
#include "expr.h"
#include "dumpfile.h"
/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain is caused by greater sequentiality
   of code, better code to optimize for further passes, and in some cases
   by fewer tests of exit conditions.  The main problem is code growth,
   which impacts performance negatively due to its effect on caches.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of the exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see the comments at
   the appropriate function below.

   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of several %.  */
/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;	/* The insn in which the induction variable occurs.  */
  rtx orig_var;		/* The variable (register) for the IV before split.  */
  rtx base_var;		/* The variable on which the values in the further
			   iterations are based.  */
  rtx step;		/* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;		/* The insn in which the variable expansion occurs.  */
  rtx reg;			/* The accumulator which is expanded.  */
  vec<rtx> var_expansions;	/* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;	/* Next entry in walking order.  */
  enum rtx_code op;		/* The type of the accumulation - addition, subtraction
				   or multiplication.  */
  int expansion_count;		/* Count the number of expansions generated so far.  */
  int reuse_expansion;		/* The expansion we intend to reuse to expand
				   the accumulator.  If REUSE_EXPANSION is 0 reuse
				   the original accumulator.  Else use
				   var_expansions[REUSE_EXPANSION - 1].  */
};
/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};

/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const iv_to_split *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}

/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
{
  return i1->insn == i2->insn;
}

/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};

/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const var_to_expand *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}

/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
{
  return i1->insn == i2->insn;
}
/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};
static void decide_unroll_stupid (class loop *, int);
static void decide_unroll_constant_iterations (class loop *, int);
static void decide_unroll_runtime_iterations (class loop *, int);
static void unroll_loop_stupid (class loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (class loop *);
static void unroll_loop_runtime_iterations (class loop *);
static struct opt_info *analyze_insns_in_loop (class loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (class loop*, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (class loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);
/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (class loop *loop, dump_location_t locus)
{
  dump_flags_t report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_metadata_t metadata (report_flags, locus.get_impl_location ());
  dump_printf_loc (metadata, locus.get_user_location (),
		   "loop unrolled %d times",
		   loop->lpt_decision.times);
  if (profile_info && loop->header->count.initialized_p ())
    dump_printf (metadata,
		 " (header execution count %d)",
		 (int)loop->header->count.to_gcov_type ());

  dump_printf (metadata, "\n");
}
/* Decide whether to unroll loops and how much.  */
static void
decide_unrolling (int flags)
{
  /* Scan the loops, inner ones first.  */
  for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
    {
      loop->lpt_decision.decision = LPT_NONE;
      dump_user_location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, locus,
			 "considering unrolling loop %d at BB %d\n",
			 loop->num, loop->header->index);

      if (loop->unroll == 1)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not unrolling loop, user didn't want it unrolled\n");
	  continue;
	}

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}
/* Unroll LOOPS.  */
void
unroll_loops (int flags)
{
  bool changed = false;

  /* Now decide rest of unrolling.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }

  iv_analysis_done ();
}
/* Check whether the exit of the LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (class loop *loop)
{
  class niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn) && active_insn_p (insn))
	return false;
    }

  return true;
}
/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with constant "
		 "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av
    = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check for an explicit unrolling factor.  */
  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    {
      /* However we cannot unroll completely at the RTL level a loop with
	 constant number of iterations; it should have been peeled instead.  */
      if (desc->niter == 0 || (unsigned) loop->unroll > desc->niter - 1)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Loop should have been peeled\n");
	}
      else
	{
	  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
	  loop->lpt_decision.times = loop->unroll - 1;
	}
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in case the loop has more
     than one exit it may well loop less than the determined maximal number
     of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_likely_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i > desc->niter - 2)
    i = desc->niter - 2;
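  /* E.g. with niter == 10, the exit test at the beginning and a candidate
     factor of i + 1 == 4, we would need exit_mod == 2 peeled copies plus
     i + 1 == 4 copies in the unrolled body, i.e. n_copies == 6.  */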
  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}
/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_constant_iterations (class loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);
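  /* E.g. for the 102-iteration loop from the comment above with
     max_unroll == 3, exit_mod == 102 % 4 == 2 iterations must be peeled
     off before entering the unrolled body.  */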
  auto_sbitmap wont_exit (max_unroll + 2);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test
	 of the exit condition have a continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod;
	  else
	    loop->any_likely_upper_bound = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as above, if
	 the loop tests the condition at the end of the loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care about
	 the case when we would exit before reaching the loop.  So just peel
	 exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
	  else
	    loop->any_likely_upper_bound = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  desc->niter_expr = gen_int_mode (desc->niter, desc->mode);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
/* Decide whether to unroll LOOP iterating a runtime-computable number of times
   and how much.  */
static void
decide_unroll_runtime_iterations (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with runtime-"
		 "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as code-gen requires it;
     we are unable to cope with overflows in the computation of the number
     of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;
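  /* E.g. nunroll == 6 leaves i == 4 here: i is the largest power of two
     not exceeding nunroll, so we unroll 4x and record times == 3.  */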
  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}
/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it did ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by the # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}
/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, profile_probability prob,
		      rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label,
			       profile_probability::uninitialized ());
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  if (prob.initialized_p ())
    add_reg_br_prob_note (jump, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}
/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES times.  The times value must be a
   power of two.  The transformation does this (with some extra care
   for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (class loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i;
  profile_probability p;
  basic_block preheader, *body, swtch, ezc_swtch = NULL;
  int may_exit_copy;
  profile_count iter_count, new_count;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      for (basic_block bb : get_dominated_by (CDI_DOMINATORS, body[i]))
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get an expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* For loops that exit at the end and whose number of iterations is
     reliable, add one to niter to account for the first pass through the
     loop body before reaching the exit test.  */
  if (exit_at_end && !desc->noloop_assumptions)
    {
      niter = expand_simple_binop (desc->mode, PLUS,
				   niter, const1_rtx,
				   NULL_RTX, 0, OPTAB_LIB_WIDEN);
      old_niter = niter;
    }

  /* Count the modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
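  /* max_unroll == LOOP->LPT_DECISION.TIMES is 2^k - 1 here, so it doubles
     as the bit mask: e.g. for times == 3 this computes niter & 3,
     i.e. niter % 4, with no risk of overflow.  */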
  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  auto_sbitmap wont_exit (max_unroll + 2);

  if (extra_zero_check || desc->noloop_assumptions)
    {
      /* Peel the first copy of loop body.  Leave the exit test if the number
	 of iterations is not reliable.  Also record the place of the extra zero
	 check.  */
      bitmap_clear (wont_exit);
      if (!desc->noloop_assumptions)
	bitmap_set_bit (wont_exit, 1);
      ezc_swtch = loop_preheader_edge (loop)->src;
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);
    }

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  /* Compute count increments for each switch block and initialize
     innermost switch block.  Switch blocks and peeled loop copies are built
     from innermost outward.  */
  iter_count = new_count = swtch->count.apply_scale (1, max_unroll + 1);
  swtch->count = new_count;

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
      unsigned j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = profile_probability::always ().apply_scale (1, i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      /* Add in count of edge from switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter),
					  gen_int_mode (j, desc->mode), EQ,
					  block_label (preheader), p, NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      new_count += iter_count;
      swtch->count = new_count;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = profile_probability::always ().apply_scale (1, max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      /* Recompute count adjustments since initial peel copy may
	 have exited and reduced those values that were computed above.  */
      iter_count = swtch->count.apply_scale (1, max_unroll + 1);
      /* Add in count of edge from switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
      if (loop->any_likely_upper_bound
	  && loop->nb_iterations_likely_upper_bound != 0)
	--loop->nb_iterations_likely_upper_bound;
      else
	loop->any_likely_upper_bound = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return silently.  */
  if (!(flags & UAP_UNROLL_ALL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE, "considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av
    = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;
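  /* As in decide_unroll_runtime_iterations, e.g. nunroll == 6 leaves
     i == 4 here, so times == 3 and the unrolled body holds four copies.  */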
  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}
/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (class loop *loop)
{
  unsigned nunroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  auto_sbitmap wont_exit (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
	 for a loop to be really simple.  We could update the counts, but the
	 problem is that we are unable to decide which exit will be taken
	 (not really true in case the number of iterations is constant,
	 but no one will do anything with this information, so we do not
	 worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}
/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

static bool
referenced_in_one_insn_in_loop_p (class loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  ++*debug_uses;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}
/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (class loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}
/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it with
   the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (class loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for, which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using, which includes the
	 initialization of the expansions with zero and the summation of
	 the expansions at the end of the computation, will yield wrong
	 results for (x = something - x); thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  class rtx_iv iv;
  struct iv_to_split *ivts;
  scalar_int_mode mode;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest) || !is_a <scalar_int_mode> (GET_MODE (dest), &mode))
    return NULL;

  if (!biv_p (insn, mode, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
/* Determine which insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (class loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  auto_vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  free (body);
  return opt_info;
}
/* Called just before loop duplication.  Records start of duplicated area
   to OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}
/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */
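/* E.g. when peeling two copies, the bodies execute in the order copy 1
   (delta 0), copy 2 (delta 1), and finally the original body number 0,
   which gets delta == N_COPIES == 2.  */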
static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}
/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}
/* Replace the use of induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */
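/* E.g. with a unit step, the SET_SRC in the copy with delta DELTA becomes
   I0 + DELTA, yielding the chains shown above analyze_iv_to_split_insn.  */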
static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  copy_rtx (ivts->step),
				  gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
1781 /* Return one expansion of the accumulator recorded in struct VE. */
1783 static rtx
1784 get_expansion (struct var_to_expand *ve)
1786 rtx reg;
1788 if (ve->reuse_expansion == 0)
1789 reg = ve->reg;
1790 else
1791 reg = ve->var_expansions[ve->reuse_expansion - 1];
1793 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
1794 ve->reuse_expansion = 0;
1795 else
1796 ve->reuse_expansion++;
1798 return reg;
/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (param_max_variable_expansions > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}
/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */
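/* Conceptually, for an expanded PLUS (or FMA) accumulator SUM this emits
   sum = sumN + ... + sum1 + sum at the exit; for MULT the partial
   products are multiplied together instead.  */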
static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  /* ve->reg might be SUBREG or some other non-shareable RTL, and we use
     it both here and as the destination of the assignment.  */
  sum = copy_rtx (sum);
  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}
/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_BIND_INSN_P (insn)
		  && INSN_VAR_LOCATION_DECL (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_BIND_INSN_P (orig_insn)
		     && INSN_VAR_LOCATION_DECL (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find the blocks as originals of
     the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}
/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}