This commit was manufactured by cvs2svn to create branch
[official-gcc.git] / gcc / loop-unroll.c
blobaae175720feacb1ae19cb4f37ea92038b94f6ebe
1 /* Loop unrolling and peeling.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to the Free
18 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "rtl.h"
26 #include "hard-reg-set.h"
27 #include "basic-block.h"
28 #include "cfgloop.h"
29 #include "cfglayout.h"
30 #include "params.h"
31 #include "output.h"
32 #include "expr.h"
33 /* We need to use the macro exact_log2. */
34 #include "toplev.h"
36 /* This pass performs loop unrolling and peeling. We only perform these
37 optimizations on innermost loops (with single exception) because
38 the impact on performance is greatest here, and we want to avoid
39 unnecessary code size growth. The gain is caused by greater sequentiality
40 of code, better code to optimize for further passes and in some cases
41 by fewer testings of exit conditions. The main problem is code growth,
42 that impacts performance negatively due to effect of caches.
44 What we do:
46 -- complete peeling of once-rolling loops; this is the above mentioned
47 exception, as this causes loop to be cancelled completely and
48 does not cause code growth
49 -- complete peeling of loops that roll (small) constant times.
50 -- simple peeling of first iterations of loops that do not roll much
51 (according to profile feedback)
52 -- unrolling of loops that roll constant times; this is almost always
53 win, as we get rid of exit condition tests.
54 -- unrolling of loops that roll number of times that we can compute
55 in runtime; we also get rid of exit condition tests here, but there
56 is the extra expense for calculating the number of iterations
57 -- simple unrolling of remaining loops; this is performed only if we
58 are asked to, as the gain is questionable in this case and often
59 it may even slow down the code
60 For more detailed descriptions of each of those, see comments at
61 appropriate function below.
63 There is a lot of parameters (defined and described in params.def) that
64 control how much we unroll/peel.
66 ??? A great problem is that we don't have a good way how to determine
67 how many times we should unroll the loop; the experiments I have made
68 showed that this choice may affect performance in order of several %.
71 static void decide_unrolling_and_peeling (struct loops *, int);
72 static void peel_loops_completely (struct loops *, int);
73 static void decide_peel_simple (struct loop *, int);
74 static void decide_peel_once_rolling (struct loop *, int);
75 static void decide_peel_completely (struct loop *, int);
76 static void decide_unroll_stupid (struct loop *, int);
77 static void decide_unroll_constant_iterations (struct loop *, int);
78 static void decide_unroll_runtime_iterations (struct loop *, int);
79 static void peel_loop_simple (struct loops *, struct loop *);
80 static void peel_loop_completely (struct loops *, struct loop *);
81 static void unroll_loop_stupid (struct loops *, struct loop *);
82 static void unroll_loop_constant_iterations (struct loops *, struct loop *);
83 static void unroll_loop_runtime_iterations (struct loops *, struct loop *);
84 static void expand_bct (edge, int);
85 static bool discard_increment (struct loop *);
87 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
88 void
89 unroll_and_peel_loops (struct loops *loops, int flags)
91 struct loop *loop, *next;
92 int check;
94 /* First perform complete loop peeling (it is almost surely a win,
95 and affects parameters for further decision a lot). */
96 peel_loops_completely (loops, flags);
98 /* Now decide rest of unrolling and peeling. */
99 decide_unrolling_and_peeling (loops, flags);
101 loop = loops->tree_root;
102 while (loop->inner)
103 loop = loop->inner;
105 /* Scan the loops, inner ones first. */
106 while (loop != loops->tree_root)
108 if (loop->next)
110 next = loop->next;
111 while (next->inner)
112 next = next->inner;
114 else
115 next = loop->outer;
117 check = 1;
118 /* And perform the appropriate transformations. */
119 switch (loop->lpt_decision.decision)
121 case LPT_PEEL_COMPLETELY:
122 /* Already done. */
123 abort ();
124 case LPT_PEEL_SIMPLE:
125 peel_loop_simple (loops, loop);
126 break;
127 case LPT_UNROLL_CONSTANT:
128 unroll_loop_constant_iterations (loops, loop);
129 break;
130 case LPT_UNROLL_RUNTIME:
131 unroll_loop_runtime_iterations (loops, loop);
132 break;
133 case LPT_UNROLL_STUPID:
134 unroll_loop_stupid (loops, loop);
135 break;
136 case LPT_NONE:
137 check = 0;
138 break;
139 default:
140 abort ();
142 if (check)
144 #ifdef ENABLE_CHECKING
145 verify_dominators (CDI_DOMINATORS);
146 verify_loop_structure (loops);
147 #endif
149 loop = next;
153 /* Check whether to peel LOOPS (depending on FLAGS) completely and do so. */
154 static void
155 peel_loops_completely (struct loops *loops, int flags)
157 struct loop *loop, *next;
159 loop = loops->tree_root;
160 while (loop->inner)
161 loop = loop->inner;
163 while (loop != loops->tree_root)
165 if (loop->next)
167 next = loop->next;
168 while (next->inner)
169 next = next->inner;
171 else
172 next = loop->outer;
174 loop->lpt_decision.decision = LPT_NONE;
175 loop->has_desc = 0;
177 if (rtl_dump_file)
178 fprintf (rtl_dump_file, ";; Considering loop %d for complete peeling\n",
179 loop->num);
181 loop->ninsns = num_loop_insns (loop);
183 decide_peel_once_rolling (loop, flags);
184 if (loop->lpt_decision.decision == LPT_NONE)
185 decide_peel_completely (loop, flags);
187 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
189 peel_loop_completely (loops, loop);
190 #ifdef ENABLE_CHECKING
191 verify_dominators (CDI_DOMINATORS);
192 verify_loop_structure (loops);
193 #endif
195 loop = next;
199 /* Decide whether unroll or peel LOOPS (depending on FLAGS) and how much. */
200 static void
201 decide_unrolling_and_peeling (struct loops *loops, int flags)
203 struct loop *loop = loops->tree_root, *next;
205 while (loop->inner)
206 loop = loop->inner;
208 /* Scan the loops, inner ones first. */
209 while (loop != loops->tree_root)
211 if (loop->next)
213 next = loop->next;
214 while (next->inner)
215 next = next->inner;
217 else
218 next = loop->outer;
220 loop->lpt_decision.decision = LPT_NONE;
222 if (rtl_dump_file)
223 fprintf (rtl_dump_file, ";; Considering loop %d\n", loop->num);
225 /* Do not peel cold areas. */
226 if (!maybe_hot_bb_p (loop->header))
228 if (rtl_dump_file)
229 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
230 loop = next;
231 continue;
234 /* Can the loop be manipulated? */
235 if (!can_duplicate_loop_p (loop))
237 if (rtl_dump_file)
238 fprintf (rtl_dump_file,
239 ";; Not considering loop, cannot duplicate\n");
240 loop = next;
241 continue;
244 /* Skip non-innermost loops. */
245 if (loop->inner)
247 if (rtl_dump_file)
248 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
249 loop = next;
250 continue;
253 loop->ninsns = num_loop_insns (loop);
254 loop->av_ninsns = average_num_loop_insns (loop);
256 /* Try transformations one by one in decreasing order of
257 priority. */
259 decide_unroll_constant_iterations (loop, flags);
260 if (loop->lpt_decision.decision == LPT_NONE)
261 decide_unroll_runtime_iterations (loop, flags);
262 if (loop->lpt_decision.decision == LPT_NONE)
263 decide_unroll_stupid (loop, flags);
264 if (loop->lpt_decision.decision == LPT_NONE)
265 decide_peel_simple (loop, flags);
267 loop = next;
271 /* Decide whether the LOOP is once rolling and suitable for complete
272 peeling. */
273 static void
274 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
276 if (rtl_dump_file)
277 fprintf (rtl_dump_file, ";; Considering peeling once rolling loop\n");
279 /* Is the loop small enough? */
280 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
282 if (rtl_dump_file)
283 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
284 return;
287 /* Check for simple loops. */
288 loop->simple = simple_loop_p (loop, &loop->desc);
289 loop->has_desc = 1;
291 /* Check number of iterations. */
292 if (!loop->simple || !loop->desc.const_iter || loop->desc.niter != 0)
294 if (rtl_dump_file)
295 fprintf (rtl_dump_file, ";; Unable to prove that the loop rolls exactly once\n");
296 return;
299 /* Success. */
300 if (rtl_dump_file)
301 fprintf (rtl_dump_file, ";; Decided to peel exactly once rolling loop\n");
302 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
305 /* Decide whether the LOOP is suitable for complete peeling. */
306 static void
307 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
309 unsigned npeel;
311 if (rtl_dump_file)
312 fprintf (rtl_dump_file, ";; Considering peeling completely\n");
314 /* Skip non-innermost loops. */
315 if (loop->inner)
317 if (rtl_dump_file)
318 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
319 return;
322 /* Do not peel cold areas. */
323 if (!maybe_hot_bb_p (loop->header))
325 if (rtl_dump_file)
326 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
327 return;
330 /* Can the loop be manipulated? */
331 if (!can_duplicate_loop_p (loop))
333 if (rtl_dump_file)
334 fprintf (rtl_dump_file,
335 ";; Not considering loop, cannot duplicate\n");
336 return;
339 /* npeel = number of iterations to peel. */
340 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
341 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
342 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
344 /* Is the loop small enough? */
345 if (!npeel)
347 if (rtl_dump_file)
348 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
349 return;
352 /* Check for simple loops. */
353 if (!loop->has_desc)
355 loop->simple = simple_loop_p (loop, &loop->desc);
356 loop->has_desc = 1;
359 /* Check number of iterations. */
360 if (!loop->simple || !loop->desc.const_iter)
362 if (rtl_dump_file)
363 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
364 return;
367 if (loop->desc.niter > npeel - 1)
369 if (rtl_dump_file)
371 fprintf (rtl_dump_file, ";; Not peeling loop completely, rolls too much (");
372 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC,(HOST_WIDEST_INT) loop->desc.niter);
373 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
375 return;
378 /* Success. */
379 if (rtl_dump_file)
380 fprintf (rtl_dump_file, ";; Decided to peel loop completely\n");
381 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
384 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
385 completely. The transformation done:
387 for (i = 0; i < 4; i++)
388 body;
392 i = 0;
393 body; i++;
394 body; i++;
395 body; i++;
396 body; i++;
398 static void
399 peel_loop_completely (struct loops *loops, struct loop *loop)
401 sbitmap wont_exit;
402 unsigned HOST_WIDE_INT npeel;
403 unsigned n_remove_edges, i;
404 edge *remove_edges;
405 struct loop_desc *desc = &loop->desc;
406 bool discard_inc = false;
407 bool is_bct;
409 if ((is_bct = is_bct_cond (BB_END (loop->desc.out_edge->src))))
410 discard_inc = discard_increment (loop);
412 npeel = desc->niter;
414 if (npeel)
416 wont_exit = sbitmap_alloc (npeel + 1);
417 sbitmap_ones (wont_exit);
418 RESET_BIT (wont_exit, 0);
419 if (desc->may_be_zero)
420 RESET_BIT (wont_exit, 1);
422 remove_edges = xcalloc (npeel, sizeof (edge));
423 n_remove_edges = 0;
425 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
426 loops, npeel,
427 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
428 DLTHE_FLAG_UPDATE_FREQ))
429 abort ();
431 free (wont_exit);
433 /* Expand the branch and count. */
434 if (is_bct)
435 for (i = 0; i < n_remove_edges; i++)
436 expand_bct (remove_edges[i], discard_inc);
438 /* Remove the exit edges. */
439 for (i = 0; i < n_remove_edges; i++)
440 remove_path (loops, remove_edges[i]);
441 free (remove_edges);
444 /* Expand the branch and count. */
445 if (is_bct)
446 expand_bct (desc->in_edge, discard_inc);
448 /* Now remove the unreachable part of the last iteration and cancel
449 the loop. */
450 remove_path (loops, desc->in_edge);
452 if (rtl_dump_file)
453 fprintf (rtl_dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
456 /* Decide whether to unroll LOOP iterating constant number of times and how much. */
457 static void
458 decide_unroll_constant_iterations (struct loop *loop, int flags)
460 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = -1, n_copies, i;
462 if (!(flags & UAP_UNROLL))
464 /* We were not asked to, just return back silently. */
465 return;
468 if (rtl_dump_file)
469 fprintf (rtl_dump_file, ";; Considering unrolling loop with constant number of iterations\n");
471 /* nunroll = total number of copies of the original loop body in
472 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
473 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
474 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
475 if (nunroll > nunroll_by_av)
476 nunroll = nunroll_by_av;
477 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
478 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
480 /* Skip big loops. */
481 if (nunroll <= 1)
483 if (rtl_dump_file)
484 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
485 return;
488 /* Check for simple loops. */
489 if (!loop->has_desc)
491 loop->simple = simple_loop_p (loop, &loop->desc);
492 loop->has_desc = 1;
495 /* Check number of iterations. */
496 if (!loop->simple || !loop->desc.const_iter)
498 if (rtl_dump_file)
499 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
500 return;
503 /* Check whether the loop rolls enough to consider. */
504 if (loop->desc.niter < 2 * nunroll)
506 if (rtl_dump_file)
507 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
508 return;
511 /* Success; now compute number of iterations to unroll. We alter
512 nunroll so that as few as possible copies of loop body are
513 necessary, while still not decreasing the number of unrollings
514 too much (at most by 1). */
515 best_copies = 2 * nunroll + 10;
517 i = 2 * nunroll + 2;
518 if ((unsigned) i - 1 >= loop->desc.niter)
519 i = loop->desc.niter - 2;
521 for (; i >= nunroll - 1; i--)
523 unsigned exit_mod = loop->desc.niter % (i + 1);
525 if (loop->desc.postincr)
526 n_copies = exit_mod + i + 1;
527 else if (exit_mod != (unsigned) i || loop->desc.may_be_zero)
528 n_copies = exit_mod + i + 2;
529 else
530 n_copies = i + 1;
532 if (n_copies < best_copies)
534 best_copies = n_copies;
535 best_unroll = i;
539 if (rtl_dump_file)
540 fprintf (rtl_dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
541 best_unroll + 1, best_copies, nunroll);
543 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
544 loop->lpt_decision.times = best_unroll;
547 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
548 times. The transformation does this:
550 for (i = 0; i < 102; i++)
551 body;
555 i = 0;
556 body; i++;
557 body; i++;
558 while (i < 102)
560 body; i++;
561 body; i++;
562 body; i++;
563 body; i++;
566 static void
567 unroll_loop_constant_iterations (struct loops *loops, struct loop *loop)
569 unsigned HOST_WIDE_INT niter;
570 unsigned exit_mod;
571 sbitmap wont_exit;
572 unsigned n_remove_edges, i;
573 edge *remove_edges;
574 unsigned max_unroll = loop->lpt_decision.times;
575 struct loop_desc *desc = &loop->desc;
576 bool discard_inc = false;
577 bool is_bct;
579 niter = desc->niter;
581 if (niter <= (unsigned) max_unroll + 1)
582 abort (); /* Should not get here (such loop should be peeled instead). */
584 exit_mod = niter % (max_unroll + 1);
586 wont_exit = sbitmap_alloc (max_unroll + 1);
587 sbitmap_ones (wont_exit);
589 remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge));
590 n_remove_edges = 0;
592 /* For a loop ending with a branch and count for which the increment
593 of the count register will be discarded, adjust the initialization of
594 the count register. */
595 if ((is_bct = is_bct_cond (BB_END (desc->out_edge->src)))
596 && (discard_inc = discard_increment (loop)))
598 rtx ini_var;
600 rtx init_code;
601 int n_peel, new_bct_value;
603 /* Get expression for number of iterations. */
604 start_sequence ();
606 n_peel = (niter+1) % (max_unroll+1);
607 new_bct_value = (niter+1 - n_peel) / (max_unroll+1) ;
608 ini_var = GEN_INT (new_bct_value);
610 emit_move_insn (desc->var, ini_var);
611 init_code = get_insns ();
612 end_sequence ();
614 loop_split_edge_with (loop_preheader_edge (loop), init_code);
617 if (desc->postincr)
619 /* Counter is incremented after the exit test; leave exit test
620 in the first copy, so that the loops that start with test
621 of exit condition have continuous body after unrolling. */
623 if (rtl_dump_file)
624 fprintf (rtl_dump_file, ";; Condition on beginning of loop.\n");
626 /* Peel exit_mod iterations. */
627 RESET_BIT (wont_exit, 0);
628 if (desc->may_be_zero)
629 RESET_BIT (wont_exit, 1);
631 if (exit_mod
632 && !duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
633 loops, exit_mod,
634 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
635 DLTHE_FLAG_UPDATE_FREQ))
636 abort ();
638 SET_BIT (wont_exit, 1);
640 else
642 /* Leave exit test in last copy, for the same reason as above if
643 the loop tests the condition at the end of loop body. */
645 if (rtl_dump_file)
646 fprintf (rtl_dump_file, ";; Condition on end of loop.\n");
648 /* We know that niter >= max_unroll + 2; so we do not need to care of
649 case when we would exit before reaching the loop. So just peel
650 exit_mod + 1 iterations.
652 if (exit_mod != (unsigned) max_unroll || desc->may_be_zero)
654 RESET_BIT (wont_exit, 0);
655 if (desc->may_be_zero)
656 RESET_BIT (wont_exit, 1);
658 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
659 loops, exit_mod + 1,
660 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
661 DLTHE_FLAG_UPDATE_FREQ))
662 abort ();
664 SET_BIT (wont_exit, 0);
665 SET_BIT (wont_exit, 1);
668 RESET_BIT (wont_exit, max_unroll);
671 /* Now unroll the loop. */
672 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
673 loops, max_unroll,
674 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
675 DLTHE_FLAG_UPDATE_FREQ))
676 abort ();
678 free (wont_exit);
680 /* Expand the branch and count. */
681 if (is_bct)
682 for (i = 0; i < n_remove_edges; i++)
683 expand_bct (remove_edges[i], discard_inc);
685 /* Remove the edges. */
686 for (i = 0; i < n_remove_edges; i++)
687 remove_path (loops, remove_edges[i]);
688 free (remove_edges);
690 if (rtl_dump_file)
691 fprintf (rtl_dump_file, ";; Unrolled loop %d times, constant # of iterations %i insns\n",max_unroll, num_loop_insns (loop));
694 /* Decide whether to unroll LOOP iterating runtime computable number of times
695 and how much. */
696 static void
697 decide_unroll_runtime_iterations (struct loop *loop, int flags)
699 unsigned nunroll, nunroll_by_av, i;
701 if (!(flags & UAP_UNROLL))
703 /* We were not asked to, just return back silently. */
704 return;
707 if (rtl_dump_file)
708 fprintf (rtl_dump_file, ";; Considering unrolling loop with runtime computable number of iterations\n");
710 /* nunroll = total number of copies of the original loop body in
711 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
712 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
713 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
714 if (nunroll > nunroll_by_av)
715 nunroll = nunroll_by_av;
716 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
717 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
719 /* Skip big loops. */
720 if (nunroll <= 1)
722 if (rtl_dump_file)
723 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
724 return;
727 /* Check for simple loops. */
728 if (!loop->has_desc)
730 loop->simple = simple_loop_p (loop, &loop->desc);
731 loop->has_desc = 1;
734 /* Check simpleness. */
735 if (!loop->simple)
737 if (rtl_dump_file)
738 fprintf (rtl_dump_file, ";; Unable to prove that the number of iterations can be counted in runtime\n");
739 return;
742 if (loop->desc.const_iter)
744 if (rtl_dump_file)
745 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
746 return;
749 /* If we have profile feedback, check whether the loop rolls. */
750 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
752 if (rtl_dump_file)
753 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
754 return;
757 /* Success; now force nunroll to be power of 2, as we are unable to
758 cope with overflows in computation of number of iterations. */
759 for (i = 1; 2 * i <= nunroll; i *= 2);
761 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
762 loop->lpt_decision.times = i - 1;
765 /* Unroll LOOP for that we are able to count number of iterations in runtime
766 LOOP->LPT_DECISION.TIMES + 1 times. The transformation does this (with some
767 extra care for case n < 0):
769 for (i = 0; i < n; i++)
770 body;
774 i = 0;
775 mod = n % 4;
777 switch (mod)
779 case 3:
780 body; i++;
781 case 2:
782 body; i++;
783 case 1:
784 body; i++;
785 case 0: ;
788 while (i < n)
790 body; i++;
791 body; i++;
792 body; i++;
793 body; i++;
796 static void
797 unroll_loop_runtime_iterations (struct loops *loops, struct loop *loop)
799 rtx niter, init_code, branch_code, jump, label;
800 unsigned i, j, p;
801 basic_block preheader, *body, *dom_bbs, swtch, ezc_swtch;
802 unsigned n_dom_bbs;
803 sbitmap wont_exit;
804 int may_exit_copy;
805 unsigned n_peel, n_remove_edges;
806 edge *remove_edges, e;
807 bool extra_zero_check, last_may_exit;
808 unsigned max_unroll = loop->lpt_decision.times;
809 struct loop_desc *desc = &loop->desc;
810 bool discard_inc = false;
811 bool is_bct;
813 /* Remember blocks whose dominators will have to be updated. */
814 dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
815 n_dom_bbs = 0;
817 body = get_loop_body (loop);
818 for (i = 0; i < loop->num_nodes; i++)
820 unsigned nldom;
821 basic_block *ldom;
823 nldom = get_dominated_by (CDI_DOMINATORS, body[i], &ldom);
824 for (j = 0; j < nldom; j++)
825 if (!flow_bb_inside_loop_p (loop, ldom[j]))
826 dom_bbs[n_dom_bbs++] = ldom[j];
828 free (ldom);
830 free (body);
832 if (desc->postincr)
834 /* Leave exit in first copy (for explanation why see comment in
835 unroll_loop_constant_iterations). */
836 may_exit_copy = 0;
837 n_peel = max_unroll - 1;
838 extra_zero_check = true;
839 last_may_exit = false;
841 else
843 /* Leave exit in last copy (for explanation why see comment in
844 unroll_loop_constant_iterations). */
845 may_exit_copy = max_unroll;
846 n_peel = max_unroll;
847 extra_zero_check = false;
848 last_may_exit = true;
851 /* Get expression for number of iterations. */
852 start_sequence ();
853 niter = count_loop_iterations (desc, NULL, NULL);
854 if (!niter)
855 abort ();
856 niter = force_operand (niter, NULL);
858 /* Count modulo by ANDing it with max_unroll; we use the fact that
859 the number of unrollings is a power of two, and thus this is correct
860 even if there is overflow in the computation. */
861 niter = expand_simple_binop (GET_MODE (desc->var), AND,
862 niter,
863 GEN_INT (max_unroll),
864 NULL_RTX, 0, OPTAB_LIB_WIDEN);
866 /* For a loop ending with a branch and count for which the increment
867 of the count register will be discarded, adjust the initialization of
868 the count register. */
869 if ((is_bct = is_bct_cond (BB_END (desc->out_edge->src)))
870 && (discard_inc = discard_increment (loop)))
872 rtx count, count2, count_unroll_mod;
873 int count_unroll;
875 /* start_sequence (); */
877 count = count_loop_iterations (desc, NULL, NULL);
879 count_unroll = loop->lpt_decision.times+1;
883 count_unroll_mod = GEN_INT (exact_log2 (count_unroll));
884 count = expand_simple_binop (GET_MODE (desc->var), LSHIFTRT,
885 count, count_unroll_mod,
886 0, 0, OPTAB_LIB_WIDEN);
888 count2 = expand_simple_binop (GET_MODE (desc->var), PLUS,
889 count, GEN_INT (2),
890 0, 0, OPTAB_LIB_WIDEN);
892 emit_move_insn (desc->var, count2);
895 init_code = get_insns ();
896 end_sequence ();
898 /* Precondition the loop. */
899 loop_split_edge_with (loop_preheader_edge (loop), init_code);
901 remove_edges = xcalloc (max_unroll + n_peel + 1, sizeof (edge));
902 n_remove_edges = 0;
904 wont_exit = sbitmap_alloc (max_unroll + 2);
906 /* Peel the first copy of loop body (almost always we must leave exit test
907 here; the only exception is when we have extra zero check and the number
908 of iterations is reliable (i.e. comes out of NE condition). Also record
909 the place of (possible) extra zero check. */
910 sbitmap_zero (wont_exit);
911 if (extra_zero_check && desc->cond == NE)
912 SET_BIT (wont_exit, 1);
913 ezc_swtch = loop_preheader_edge (loop)->src;
914 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
915 loops, 1,
916 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
917 DLTHE_FLAG_UPDATE_FREQ))
918 abort ();
920 /* Record the place where switch will be built for preconditioning. */
921 swtch = loop_split_edge_with (loop_preheader_edge (loop),
922 NULL_RTX);
924 for (i = 0; i < n_peel; i++)
926 /* Peel the copy. */
927 sbitmap_zero (wont_exit);
928 if (i != n_peel - 1 || !last_may_exit)
929 SET_BIT (wont_exit, 1);
930 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
931 loops, 1,
932 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
933 DLTHE_FLAG_UPDATE_FREQ))
934 abort ();
936 /* Create item for switch. */
937 j = n_peel - i - (extra_zero_check ? 0 : 1);
938 p = REG_BR_PROB_BASE / (i + 2);
940 /* If modulo is zero do not jumo to the header of the unrolled loops.
941 Jump instead to the last branch and count that precedes it. */
942 if (is_bct && discard_inc && (j == 0))
944 basic_block lastbb = loop_preheader_edge(loop)->src;
945 rtx split_after;
947 /* Skip dummy basic blocks generated during the unrolling. */
948 while (!is_bct_cond (BB_END (lastbb)))
949 lastbb = lastbb->pred->src;
951 split_after = PREV_INSN (BB_END (lastbb));
953 preheader = split_loop_bb (lastbb , split_after)->dest;
955 else
956 preheader = loop_split_edge_with (loop_preheader_edge (loop),
957 NULL_RTX);
958 label = block_label (preheader);
959 start_sequence ();
960 do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
961 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
962 label);
963 jump = get_last_insn ();
964 JUMP_LABEL (jump) = label;
965 REG_NOTES (jump)
966 = gen_rtx_EXPR_LIST (REG_BR_PROB,
967 GEN_INT (p), REG_NOTES (jump));
969 LABEL_NUSES (label)++;
970 branch_code = get_insns ();
971 end_sequence ();
973 swtch = loop_split_edge_with (swtch->pred, branch_code);
974 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
975 swtch->succ->probability = REG_BR_PROB_BASE - p;
976 e = make_edge (swtch, preheader,
977 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
978 e->probability = p;
981 if (extra_zero_check)
983 /* Add branch for zero iterations. */
984 p = REG_BR_PROB_BASE / (max_unroll + 1);
985 swtch = ezc_swtch;
986 preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX);
987 label = block_label (preheader);
988 start_sequence ();
989 do_compare_rtx_and_jump (copy_rtx (niter), const0_rtx, EQ, 0,
990 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
991 label);
992 jump = get_last_insn ();
993 JUMP_LABEL (jump) = label;
994 REG_NOTES (jump)
995 = gen_rtx_EXPR_LIST (REG_BR_PROB,
996 GEN_INT (p), REG_NOTES (jump));
998 LABEL_NUSES (label)++;
999 branch_code = get_insns ();
1000 end_sequence ();
1002 swtch = loop_split_edge_with (swtch->succ, branch_code);
1003 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1004 swtch->succ->probability = REG_BR_PROB_BASE - p;
1005 e = make_edge (swtch, preheader,
1006 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
1007 e->probability = p;
1010 /* Recount dominators for outer blocks. */
1011 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, n_dom_bbs);
1013 /* And unroll loop. */
1015 sbitmap_ones (wont_exit);
1016 RESET_BIT (wont_exit, may_exit_copy);
1018 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1019 loops, max_unroll,
1020 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
1021 DLTHE_FLAG_UPDATE_FREQ))
1022 abort ();
1024 free (wont_exit);
1026 /* Expand the branch and count. */
1027 if (is_bct)
1028 for (i = 0; i < n_remove_edges; i++)
1029 expand_bct (remove_edges[i], discard_inc);
1031 /* Remove the edges. */
1032 for (i = 0; i < n_remove_edges; i++)
1033 remove_path (loops, remove_edges[i]);
1034 free (remove_edges);
1036 if (rtl_dump_file)
1037 fprintf (rtl_dump_file,
1038 ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n",
1039 max_unroll, num_loop_insns (loop));
1042 /* Decide whether to simply peel LOOP and how much. */
1043 static void
1044 decide_peel_simple (struct loop *loop, int flags)
1046 unsigned npeel;
1048 if (!(flags & UAP_PEEL))
1050 /* We were not asked to, just return back silently. */
1051 return;
1054 if (rtl_dump_file)
1055 fprintf (rtl_dump_file, ";; Considering simply peeling loop\n");
1057 /* npeel = number of iterations to peel. */
1058 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1059 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1060 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1062 /* Skip big loops. */
1063 if (!npeel)
1065 if (rtl_dump_file)
1066 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
1067 return;
1070 /* Check for simple loops. */
1071 if (!loop->has_desc)
1073 loop->simple = simple_loop_p (loop, &loop->desc);
1074 loop->has_desc = 1;
1077 /* Check number of iterations. */
1078 if (loop->simple && loop->desc.const_iter)
1080 if (rtl_dump_file)
1081 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
1082 return;
1085 /* Do not simply peel loops with branches inside -- it increases number
1086 of mispredicts. */
1087 if (loop->desc.n_branches > 1)
1089 if (rtl_dump_file)
1090 fprintf (rtl_dump_file, ";; Not peeling, contains branches\n");
1091 return;
1094 if (loop->header->count)
1096 unsigned niter = expected_loop_iterations (loop);
1097 if (niter + 1 > npeel)
1099 if (rtl_dump_file)
1101 fprintf (rtl_dump_file, ";; Not peeling loop, rolls too much (");
1102 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC, (HOST_WIDEST_INT) (niter + 1));
1103 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
1105 return;
1107 npeel = niter + 1;
1109 else
1111 /* For now we have no good heuristics to decide whether loop peeling
1112 will be effective, so disable it. */
1113 if (rtl_dump_file)
1114 fprintf (rtl_dump_file,
1115 ";; Not peeling loop, no evidence it will be profitable\n");
1116 return;
1119 /* Success. */
1120 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1121 loop->lpt_decision.times = npeel;
1124 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1125 while (cond)
1126 body;
1130 if (!cond) goto end;
1131 body;
1132 if (!cond) goto end;
1133 body;
1134 while (cond)
1135 body;
1136 end: ;
1138 static void
1139 peel_loop_simple (struct loops *loops, struct loop *loop)
1141 sbitmap wont_exit;
1142 unsigned npeel = loop->lpt_decision.times;
1144 wont_exit = sbitmap_alloc (npeel + 1);
1145 sbitmap_zero (wont_exit);
1147 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1148 loops, npeel, wont_exit, NULL, NULL, NULL,
1149 DLTHE_FLAG_UPDATE_FREQ))
1150 abort ();
1152 free (wont_exit);
1154 if (rtl_dump_file)
1155 fprintf (rtl_dump_file, ";; Peeling loop %d times\n", npeel);
1158 /* Decide whether to unroll LOOP stupidly and how much. */
1159 static void
1160 decide_unroll_stupid (struct loop *loop, int flags)
1162 unsigned nunroll, nunroll_by_av, i;
1164 if (!(flags & UAP_UNROLL_ALL))
1166 /* We were not asked to, just return back silently. */
1167 return;
1170 if (rtl_dump_file)
1171 fprintf (rtl_dump_file, ";; Considering unrolling loop stupidly\n");
1173 /* nunroll = total number of copies of the original loop body in
1174 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1175 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1176 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1177 if (nunroll > nunroll_by_av)
1178 nunroll = nunroll_by_av;
1179 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1180 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1182 /* Skip big loops. */
1183 if (nunroll <= 1)
1185 if (rtl_dump_file)
1186 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
1187 return;
1190 /* Check for simple loops. */
1191 if (!loop->has_desc)
1193 loop->simple = simple_loop_p (loop, &loop->desc);
1194 loop->has_desc = 1;
1197 /* Check simpleness. */
1198 if (loop->simple)
1200 if (rtl_dump_file)
1201 fprintf (rtl_dump_file, ";; The loop is simple\n");
1202 return;
1205 /* Do not unroll loops with branches inside -- it increases number
1206 of mispredicts. */
1207 if (loop->desc.n_branches > 1)
1209 if (rtl_dump_file)
1210 fprintf (rtl_dump_file, ";; Not unrolling, contains branches\n");
1211 return;
1214 /* If we have profile feedback, check whether the loop rolls. */
1215 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
1217 if (rtl_dump_file)
1218 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
1219 return;
1222 /* Success. Now force nunroll to be power of 2, as it seems that this
1223 improves results (partially because of better alignments, partially
1224 because of some dark magic). */
1225 for (i = 1; 2 * i <= nunroll; i *= 2);
1227 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1228 loop->lpt_decision.times = i - 1;
1231 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1232 while (cond)
1233 body;
1237 while (cond)
1239 body;
1240 if (!cond) break;
1241 body;
1242 if (!cond) break;
1243 body;
1244 if (!cond) break;
1245 body;
1248 static void
1249 unroll_loop_stupid (struct loops *loops, struct loop *loop)
1251 sbitmap wont_exit;
1252 unsigned nunroll = loop->lpt_decision.times;
1254 wont_exit = sbitmap_alloc (nunroll + 1);
1255 sbitmap_zero (wont_exit);
1257 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1258 loops, nunroll, wont_exit, NULL, NULL, NULL,
1259 DLTHE_FLAG_UPDATE_FREQ))
1260 abort ();
1262 free (wont_exit);
1264 if (rtl_dump_file)
1265 fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n",
1266 nunroll, num_loop_insns (loop));
1269 /* Expand a bct instruction in a branch and an increment.
1270 If flag_inc is set, the induction variable does not need to be
1271 incremented. */
1272 void expand_bct (edge e, int flag_inc)
1274 rtx bct_insn = BB_END (e->src);
1275 rtx cmp;
1276 rtx inc;
1277 rtx seq;
1279 rtx tgt;
1280 rtx condition;
1281 rtx label;
1282 rtx reg;
1283 rtx jump;
1284 rtx pattern = PATTERN(bct_insn);
1286 if (!(is_bct_cond(bct_insn)))
1287 return;
1289 inc = get_var_set_from_bct (bct_insn);
1290 cmp = XVECEXP (pattern, 0, 0);
1291 reg = SET_DEST (inc);
1293 start_sequence ();
1294 if (!flag_inc)
1296 tgt = force_operand (XEXP (inc, 1), XEXP (inc, 0));
1297 if (tgt != XEXP (inc, 0))
1298 emit_move_insn (XEXP (inc, 0), tgt);
1301 condition = XEXP (SET_SRC (cmp), 0);
1302 label = XEXP (SET_SRC (cmp), 1);
1304 do_compare_rtx_and_jump (copy_rtx (reg), XEXP (condition, 1),
1305 GET_CODE (condition), 0,
1306 GET_MODE (reg), NULL_RTX, NULL_RTX,
1307 label);
1308 jump = get_last_insn ();
1309 JUMP_LABEL (jump) = label;
1310 seq = get_insns ();
1311 end_sequence ();
1312 emit_insn_after (seq, bct_insn);
1314 delete_insn (bct_insn);
1316 return;
1319 /* Check that the increment of the count register can be discarded. */
1320 bool
1321 discard_increment (struct loop *loop)
1323 struct loop_desc *desc = &loop->desc;
1324 rtx inc, set_src, reg;
1325 rtx bct_insn;
1326 unsigned int i;
1327 basic_block *body;
1329 bct_insn = BB_END (desc->out_edge->src);
1330 if (!is_bct_cond (bct_insn))
1331 abort();
1333 inc = get_var_set_from_bct (bct_insn);
1335 /* Check that inc is of the form reg = reg - 1. */
1336 reg = SET_DEST (inc);
1337 set_src = SET_SRC (inc);
1339 if (GET_CODE (set_src) != PLUS)
1340 return false;
1342 if (!rtx_equal_p (XEXP (set_src, 0), reg))
1343 return false;
1345 if (!CONSTANT_P (XEXP (set_src, 1)))
1346 return false;
1348 if (INTVAL (XEXP (set_src, 1)) != -1)
1349 return false;
1351 /* We need to check that the register has no other uses beside the branch and
1352 count. */
1353 body = get_loop_body (loop);
1354 for(i=0; i < loop->num_nodes; i++)
1356 if (reg_mentioned_p (desc->var, BB_HEAD (body[i])))
1357 return false;
1359 if (body[i] != desc->out_edge->src)
1360 if (reg_mentioned_p (desc->var, BB_END (body[i])))
1361 return false;
1363 if (reg_used_between_p (desc->var, BB_HEAD (body[i]), BB_END (body[i])))
1364 return false;
1367 /* Check that the branch and count ends the latch. */
1368 if (desc->out_edge->src != loop->latch)
1370 rtx insn;
1372 /* Latch is a dummy block generated by loop-init. */
1373 if (BRANCH_EDGE(desc->out_edge->src)->dest != loop->latch)
1374 return false;
1376 for (insn = BB_HEAD (loop->latch); insn != NEXT_INSN (BB_END (loop->latch));
1377 insn = NEXT_INSN (insn))
1378 if (INSN_P (insn)) return false;
1381 return true;