* configure.in: Pass a computed --program-transform-name
[official-gcc.git] / gcc / loop-unroll.c
blobdde3c7439af8bb8c6abab27b46345e1724d3ac0e
1 /* Loop unrolling and peeling.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to the Free
18 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "rtl.h"
26 #include "hard-reg-set.h"
27 #include "basic-block.h"
28 #include "cfgloop.h"
29 #include "cfglayout.h"
30 #include "params.h"
31 #include "output.h"
32 #include "expr.h"
34 /* This pass performs loop unrolling and peeling. We only perform these
35 optimizations on innermost loops (with single exception) because
36 the impact on performance is greatest here, and we want to avoid
37 unnecessary code size growth. The gain is caused by greater sequentiality
38 of code, better code to optimize for further passes and in some cases
39 by fewer testings of exit conditions. The main problem is code growth,
40 that impacts performance negatively due to effect of caches.
42 What we do:
44 -- complete peeling of once-rolling loops; this is the above mentioned
45 exception, as this causes loop to be cancelled completely and
46 does not cause code growth
47 -- complete peeling of loops that roll (small) constant times.
48 -- simple peeling of first iterations of loops that do not roll much
49 (according to profile feedback)
50 -- unrolling of loops that roll constant times; this is almost always
51 win, as we get rid of exit condition tests.
52 -- unrolling of loops that roll number of times that we can compute
53 in runtime; we also get rid of exit condition tests here, but there
54 is the extra expense for calculating the number of iterations
55 -- simple unrolling of remaining loops; this is performed only if we
56 are asked to, as the gain is questionable in this case and often
57 it may even slow down the code
58 For more detailed descriptions of each of those, see comments at
59 appropriate function below.
61 There is a lot of parameters (defined and described in params.def) that
62 control how much we unroll/peel.
64 ??? A great problem is that we don't have a good way how to determine
65 how many times we should unroll the loop; the experiments I have made
66 showed that this choice may affect performance in order of several %.
69 static void decide_unrolling_and_peeling (struct loops *, int);
70 static void peel_loops_completely (struct loops *, int);
71 static void decide_peel_simple (struct loops *, struct loop *, int);
72 static void decide_peel_once_rolling (struct loops *, struct loop *, int);
73 static void decide_peel_completely (struct loops *, struct loop *, int);
74 static void decide_unroll_stupid (struct loops *, struct loop *, int);
75 static void decide_unroll_constant_iterations (struct loops *,
76 struct loop *, int);
77 static void decide_unroll_runtime_iterations (struct loops *, struct loop *,
78 int);
79 static void peel_loop_simple (struct loops *, struct loop *);
80 static void peel_loop_completely (struct loops *, struct loop *);
81 static void unroll_loop_stupid (struct loops *, struct loop *);
82 static void unroll_loop_constant_iterations (struct loops *, struct loop *);
83 static void unroll_loop_runtime_iterations (struct loops *, struct loop *);
85 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
86 void
87 unroll_and_peel_loops (struct loops *loops, int flags)
89 struct loop *loop, *next;
90 int check;
92 /* First perform complete loop peeling (it is almost surely a win,
93 and affects parameters for further decision a lot). */
94 peel_loops_completely (loops, flags);
96 /* Now decide rest of unrolling and peeling. */
97 decide_unrolling_and_peeling (loops, flags);
99 loop = loops->tree_root;
100 while (loop->inner)
101 loop = loop->inner;
103 /* Scan the loops, inner ones first. */
104 while (loop != loops->tree_root)
106 if (loop->next)
108 next = loop->next;
109 while (next->inner)
110 next = next->inner;
112 else
113 next = loop->outer;
115 check = 1;
116 /* And perform the appropriate transformations. */
117 switch (loop->lpt_decision.decision)
119 case LPT_PEEL_COMPLETELY:
120 /* Already done. */
121 abort ();
122 case LPT_PEEL_SIMPLE:
123 peel_loop_simple (loops, loop);
124 break;
125 case LPT_UNROLL_CONSTANT:
126 unroll_loop_constant_iterations (loops, loop);
127 break;
128 case LPT_UNROLL_RUNTIME:
129 unroll_loop_runtime_iterations (loops, loop);
130 break;
131 case LPT_UNROLL_STUPID:
132 unroll_loop_stupid (loops, loop);
133 break;
134 case LPT_NONE:
135 check = 0;
136 break;
137 default:
138 abort ();
140 if (check)
142 #ifdef ENABLE_CHECKING
143 verify_dominators (loops->cfg.dom);
144 verify_loop_structure (loops);
145 #endif
147 loop = next;
151 /* Check whether to peel LOOPS (depending on FLAGS) completely and do so. */
152 static void
153 peel_loops_completely (struct loops *loops, int flags)
155 struct loop *loop, *next;
157 loop = loops->tree_root;
158 while (loop->inner)
159 loop = loop->inner;
161 while (loop != loops->tree_root)
163 if (loop->next)
165 next = loop->next;
166 while (next->inner)
167 next = next->inner;
169 else
170 next = loop->outer;
172 loop->lpt_decision.decision = LPT_NONE;
173 loop->has_desc = 0;
175 if (rtl_dump_file)
176 fprintf (rtl_dump_file, ";; Considering loop %d for complete peeling\n",
177 loop->num);
179 loop->ninsns = num_loop_insns (loop);
181 decide_peel_once_rolling (loops, loop, flags);
182 if (loop->lpt_decision.decision == LPT_NONE)
183 decide_peel_completely (loops, loop, flags);
185 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
187 peel_loop_completely (loops, loop);
188 #ifdef ENABLE_CHECKING
189 verify_dominators (loops->cfg.dom);
190 verify_loop_structure (loops);
191 #endif
193 loop = next;
197 /* Decide whether unroll or peel LOOPS (depending on FLAGS) and how much. */
198 static void
199 decide_unrolling_and_peeling (struct loops *loops, int flags)
201 struct loop *loop = loops->tree_root, *next;
203 while (loop->inner)
204 loop = loop->inner;
206 /* Scan the loops, inner ones first. */
207 while (loop != loops->tree_root)
209 if (loop->next)
211 next = loop->next;
212 while (next->inner)
213 next = next->inner;
215 else
216 next = loop->outer;
218 loop->lpt_decision.decision = LPT_NONE;
220 if (rtl_dump_file)
221 fprintf (rtl_dump_file, ";; Considering loop %d\n", loop->num);
223 /* Do not peel cold areas. */
224 if (!maybe_hot_bb_p (loop->header))
226 if (rtl_dump_file)
227 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
228 loop = next;
229 continue;
232 /* Can the loop be manipulated? */
233 if (!can_duplicate_loop_p (loop))
235 if (rtl_dump_file)
236 fprintf (rtl_dump_file,
237 ";; Not considering loop, cannot duplicate\n");
238 loop = next;
239 continue;
242 /* Skip non-innermost loops. */
243 if (loop->inner)
245 if (rtl_dump_file)
246 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
247 loop = next;
248 continue;
251 loop->ninsns = num_loop_insns (loop);
252 loop->av_ninsns = average_num_loop_insns (loop);
254 /* Try transformations one by one in decreasing order of
255 priority. */
257 decide_unroll_constant_iterations (loops, loop, flags);
258 if (loop->lpt_decision.decision == LPT_NONE)
259 decide_unroll_runtime_iterations (loops, loop, flags);
260 if (loop->lpt_decision.decision == LPT_NONE)
261 decide_unroll_stupid (loops, loop, flags);
262 if (loop->lpt_decision.decision == LPT_NONE)
263 decide_peel_simple (loops, loop, flags);
265 loop = next;
269 /* Decide whether the LOOP is once rolling and suitable for complete
270 peeling. */
271 static void
272 decide_peel_once_rolling (struct loops *loops, struct loop *loop,
273 int flags ATTRIBUTE_UNUSED)
275 if (rtl_dump_file)
276 fprintf (rtl_dump_file, ";; Considering peeling once rolling loop\n");
278 /* Is the loop small enough? */
279 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
281 if (rtl_dump_file)
282 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
283 return;
286 /* Check for simple loops. */
287 loop->simple = simple_loop_p (loops, loop, &loop->desc);
288 loop->has_desc = 1;
290 /* Check number of iterations. */
291 if (!loop->simple || !loop->desc.const_iter || loop->desc.niter != 0)
293 if (rtl_dump_file)
294 fprintf (rtl_dump_file, ";; Unable to prove that the loop rolls exactly once\n");
295 return;
298 /* Success. */
299 if (rtl_dump_file)
300 fprintf (rtl_dump_file, ";; Decided to peel exactly once rolling loop\n");
301 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
304 /* Decide whether the LOOP is suitable for complete peeling. */
305 static void
306 decide_peel_completely (struct loops *loops, struct loop *loop,
307 int flags ATTRIBUTE_UNUSED)
309 unsigned npeel;
311 if (rtl_dump_file)
312 fprintf (rtl_dump_file, ";; Considering peeling completely\n");
314 /* Skip non-innermost loops. */
315 if (loop->inner)
317 if (rtl_dump_file)
318 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
319 return;
322 /* Do not peel cold areas. */
323 if (!maybe_hot_bb_p (loop->header))
325 if (rtl_dump_file)
326 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
327 return;
330 /* Can the loop be manipulated? */
331 if (!can_duplicate_loop_p (loop))
333 if (rtl_dump_file)
334 fprintf (rtl_dump_file,
335 ";; Not considering loop, cannot duplicate\n");
336 return;
339 /* npeel = number of iterations to peel. */
340 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
341 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
342 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
344 /* Is the loop small enough? */
345 if (!npeel)
347 if (rtl_dump_file)
348 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
349 return;
352 /* Check for simple loops. */
353 if (!loop->has_desc)
355 loop->simple = simple_loop_p (loops, loop, &loop->desc);
356 loop->has_desc = 1;
359 /* Check number of iterations. */
360 if (!loop->simple || !loop->desc.const_iter)
362 if (rtl_dump_file)
363 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
364 return;
367 if (loop->desc.niter > npeel - 1)
369 if (rtl_dump_file)
371 fprintf (rtl_dump_file, ";; Not peeling loop completely, rolls too much (");
372 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC,(HOST_WIDEST_INT) loop->desc.niter);
373 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
375 return;
378 /* Success. */
379 if (rtl_dump_file)
380 fprintf (rtl_dump_file, ";; Decided to peel loop completely\n");
381 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
384 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
385 completely. The transformation done:
387 for (i = 0; i < 4; i++)
388 body;
392 i = 0;
393 body; i++;
394 body; i++;
395 body; i++;
396 body; i++;
398 static void
399 peel_loop_completely (struct loops *loops, struct loop *loop)
401 sbitmap wont_exit;
402 unsigned HOST_WIDE_INT npeel;
403 unsigned n_remove_edges, i;
404 edge *remove_edges;
405 struct loop_desc *desc = &loop->desc;
407 npeel = desc->niter;
409 if (npeel)
411 wont_exit = sbitmap_alloc (npeel + 1);
412 sbitmap_ones (wont_exit);
413 RESET_BIT (wont_exit, 0);
414 if (desc->may_be_zero)
415 RESET_BIT (wont_exit, 1);
417 remove_edges = xcalloc (npeel, sizeof (edge));
418 n_remove_edges = 0;
420 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
421 loops, npeel,
422 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
423 DLTHE_FLAG_UPDATE_FREQ))
424 abort ();
426 free (wont_exit);
428 /* Remove the exit edges. */
429 for (i = 0; i < n_remove_edges; i++)
430 remove_path (loops, remove_edges[i]);
431 free (remove_edges);
434 /* Now remove the unreachable part of the last iteration and cancel
435 the loop. */
436 remove_path (loops, desc->in_edge);
438 if (rtl_dump_file)
439 fprintf (rtl_dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
442 /* Decide whether to unroll LOOP iterating constant number of times and how much. */
443 static void
444 decide_unroll_constant_iterations (struct loops *loops, struct loop *loop,
445 int flags)
447 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = -1, n_copies, i;
449 if (!(flags & UAP_UNROLL))
451 /* We were not asked to, just return back silently. */
452 return;
455 if (rtl_dump_file)
456 fprintf (rtl_dump_file, ";; Considering unrolling loop with constant number of iterations\n");
458 /* nunroll = total number of copies of the original loop body in
459 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
460 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
461 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
462 if (nunroll > nunroll_by_av)
463 nunroll = nunroll_by_av;
464 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
465 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
467 /* Skip big loops. */
468 if (nunroll <= 1)
470 if (rtl_dump_file)
471 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
472 return;
475 /* Check for simple loops. */
476 if (!loop->has_desc)
478 loop->simple = simple_loop_p (loops, loop, &loop->desc);
479 loop->has_desc = 1;
482 /* Check number of iterations. */
483 if (!loop->simple || !loop->desc.const_iter)
485 if (rtl_dump_file)
486 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
487 return;
490 /* Check whether the loop rolls enough to consider. */
491 if (loop->desc.niter < 2 * nunroll)
493 if (rtl_dump_file)
494 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
495 return;
498 /* Success; now compute number of iterations to unroll. We alter
499 nunroll so that as few as possible copies of loop body are
500 necessary, while still not decreasing the number of unrollings
501 too much (at most by 1). */
502 best_copies = 2 * nunroll + 10;
504 i = 2 * nunroll + 2;
505 if ((unsigned) i - 1 >= loop->desc.niter)
506 i = loop->desc.niter - 2;
508 for (; i >= nunroll - 1; i--)
510 unsigned exit_mod = loop->desc.niter % (i + 1);
512 if (loop->desc.postincr)
513 n_copies = exit_mod + i + 1;
514 else if (exit_mod != (unsigned) i || loop->desc.may_be_zero)
515 n_copies = exit_mod + i + 2;
516 else
517 n_copies = i + 1;
519 if (n_copies < best_copies)
521 best_copies = n_copies;
522 best_unroll = i;
526 if (rtl_dump_file)
527 fprintf (rtl_dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
528 best_unroll + 1, best_copies, nunroll);
530 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
531 loop->lpt_decision.times = best_unroll;
534 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
535 times. The transformation does this:
537 for (i = 0; i < 102; i++)
538 body;
542 i = 0;
543 body; i++;
544 body; i++;
545 while (i < 102)
547 body; i++;
548 body; i++;
549 body; i++;
550 body; i++;
553 static void
554 unroll_loop_constant_iterations (struct loops *loops, struct loop *loop)
556 unsigned HOST_WIDE_INT niter;
557 unsigned exit_mod;
558 sbitmap wont_exit;
559 unsigned n_remove_edges, i;
560 edge *remove_edges;
561 unsigned max_unroll = loop->lpt_decision.times;
562 struct loop_desc *desc = &loop->desc;
564 niter = desc->niter;
566 if (niter <= (unsigned) max_unroll + 1)
567 abort (); /* Should not get here (such loop should be peeled instead). */
569 exit_mod = niter % (max_unroll + 1);
571 wont_exit = sbitmap_alloc (max_unroll + 1);
572 sbitmap_ones (wont_exit);
574 remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge));
575 n_remove_edges = 0;
577 if (desc->postincr)
579 /* Counter is incremented after the exit test; leave exit test
580 in the first copy, so that the loops that start with test
581 of exit condition have continuous body after unrolling. */
583 if (rtl_dump_file)
584 fprintf (rtl_dump_file, ";; Condition on beginning of loop.\n");
586 /* Peel exit_mod iterations. */
587 RESET_BIT (wont_exit, 0);
588 if (desc->may_be_zero)
589 RESET_BIT (wont_exit, 1);
591 if (exit_mod
592 && !duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
593 loops, exit_mod,
594 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
595 DLTHE_FLAG_UPDATE_FREQ))
596 abort ();
598 SET_BIT (wont_exit, 1);
600 else
602 /* Leave exit test in last copy, for the same reason as above if
603 the loop tests the condition at the end of loop body. */
605 if (rtl_dump_file)
606 fprintf (rtl_dump_file, ";; Condition on end of loop.\n");
608 /* We know that niter >= max_unroll + 2; so we do not need to care of
609 case when we would exit before reaching the loop. So just peel
610 exit_mod + 1 iterations.
612 if (exit_mod != (unsigned) max_unroll || desc->may_be_zero)
614 RESET_BIT (wont_exit, 0);
615 if (desc->may_be_zero)
616 RESET_BIT (wont_exit, 1);
618 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
619 loops, exit_mod + 1,
620 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
621 DLTHE_FLAG_UPDATE_FREQ))
622 abort ();
624 SET_BIT (wont_exit, 0);
625 SET_BIT (wont_exit, 1);
628 RESET_BIT (wont_exit, max_unroll);
631 /* Now unroll the loop. */
632 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
633 loops, max_unroll,
634 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
635 DLTHE_FLAG_UPDATE_FREQ))
636 abort ();
638 free (wont_exit);
640 /* Remove the edges. */
641 for (i = 0; i < n_remove_edges; i++)
642 remove_path (loops, remove_edges[i]);
643 free (remove_edges);
645 if (rtl_dump_file)
646 fprintf (rtl_dump_file, ";; Unrolled loop %d times, constant # of iterations %i insns\n",max_unroll, num_loop_insns (loop));
649 /* Decide whether to unroll LOOP iterating runtime computable number of times
650 and how much. */
651 static void
652 decide_unroll_runtime_iterations (struct loops *loops, struct loop *loop,
653 int flags)
655 unsigned nunroll, nunroll_by_av, i;
657 if (!(flags & UAP_UNROLL))
659 /* We were not asked to, just return back silently. */
660 return;
663 if (rtl_dump_file)
664 fprintf (rtl_dump_file, ";; Considering unrolling loop with runtime computable number of iterations\n");
666 /* nunroll = total number of copies of the original loop body in
667 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
668 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
669 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
670 if (nunroll > nunroll_by_av)
671 nunroll = nunroll_by_av;
672 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
673 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
675 /* Skip big loops. */
676 if (nunroll <= 1)
678 if (rtl_dump_file)
679 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
680 return;
683 /* Check for simple loops. */
684 if (!loop->has_desc)
686 loop->simple = simple_loop_p (loops, loop, &loop->desc);
687 loop->has_desc = 1;
690 /* Check simpleness. */
691 if (!loop->simple)
693 if (rtl_dump_file)
694 fprintf (rtl_dump_file, ";; Unable to prove that the number of iterations can be counted in runtime\n");
695 return;
698 if (loop->desc.const_iter)
700 if (rtl_dump_file)
701 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
702 return;
705 /* If we have profile feedback, check whether the loop rolls. */
706 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
708 if (rtl_dump_file)
709 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
710 return;
713 /* Success; now force nunroll to be power of 2, as we are unable to
714 cope with overflows in computation of number of iterations. */
715 for (i = 1; 2 * i <= nunroll; i *= 2);
717 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
718 loop->lpt_decision.times = i - 1;
721 /* Unroll LOOP for that we are able to count number of iterations in runtime
722 LOOP->LPT_DECISION.TIMES + 1 times. The transformation does this (with some
723 extra care for case n < 0):
725 for (i = 0; i < n; i++)
726 body;
730 i = 0;
731 mod = n % 4;
733 switch (mod)
735 case 3:
736 body; i++;
737 case 2:
738 body; i++;
739 case 1:
740 body; i++;
741 case 0: ;
744 while (i < n)
746 body; i++;
747 body; i++;
748 body; i++;
749 body; i++;
752 static void
753 unroll_loop_runtime_iterations (struct loops *loops, struct loop *loop)
755 rtx niter, init_code, branch_code, jump, label;
756 unsigned i, j, p;
757 basic_block preheader, *body, *dom_bbs, swtch, ezc_swtch;
758 unsigned n_dom_bbs;
759 sbitmap wont_exit;
760 int may_exit_copy;
761 unsigned n_peel, n_remove_edges;
762 edge *remove_edges, e;
763 bool extra_zero_check, last_may_exit;
764 unsigned max_unroll = loop->lpt_decision.times;
765 struct loop_desc *desc = &loop->desc;
767 /* Remember blocks whose dominators will have to be updated. */
768 dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
769 n_dom_bbs = 0;
771 body = get_loop_body (loop);
772 for (i = 0; i < loop->num_nodes; i++)
774 unsigned nldom;
775 basic_block *ldom;
777 nldom = get_dominated_by (loops->cfg.dom, body[i], &ldom);
778 for (j = 0; j < nldom; j++)
779 if (!flow_bb_inside_loop_p (loop, ldom[j]))
780 dom_bbs[n_dom_bbs++] = ldom[j];
782 free (ldom);
784 free (body);
786 if (desc->postincr)
788 /* Leave exit in first copy (for explanation why see comment in
789 unroll_loop_constant_iterations). */
790 may_exit_copy = 0;
791 n_peel = max_unroll - 1;
792 extra_zero_check = true;
793 last_may_exit = false;
795 else
797 /* Leave exit in last copy (for explanation why see comment in
798 unroll_loop_constant_iterations). */
799 may_exit_copy = max_unroll;
800 n_peel = max_unroll;
801 extra_zero_check = false;
802 last_may_exit = true;
805 /* Get expression for number of iterations. */
806 start_sequence ();
807 niter = count_loop_iterations (desc, NULL, NULL);
808 if (!niter)
809 abort ();
810 niter = force_operand (niter, NULL);
812 /* Count modulo by ANDing it with max_unroll; we use the fact that
813 the number of unrollings is a power of two, and thus this is correct
814 even if there is overflow in the computation. */
815 niter = expand_simple_binop (GET_MODE (desc->var), AND,
816 niter,
817 GEN_INT (max_unroll),
818 NULL_RTX, 0, OPTAB_LIB_WIDEN);
820 init_code = get_insns ();
821 end_sequence ();
823 /* Precondition the loop. */
824 loop_split_edge_with (loop_preheader_edge (loop), init_code, loops);
826 remove_edges = xcalloc (max_unroll + n_peel + 1, sizeof (edge));
827 n_remove_edges = 0;
829 wont_exit = sbitmap_alloc (max_unroll + 2);
831 /* Peel the first copy of loop body (almost always we must leave exit test
832 here; the only exception is when we have extra zero check and the number
833 of iterations is reliable (i.e. comes out of NE condition). Also record
834 the place of (possible) extra zero check. */
835 sbitmap_zero (wont_exit);
836 if (extra_zero_check && desc->cond == NE)
837 SET_BIT (wont_exit, 1);
838 ezc_swtch = loop_preheader_edge (loop)->src;
839 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
840 loops, 1,
841 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
842 DLTHE_FLAG_UPDATE_FREQ))
843 abort ();
845 /* Record the place where switch will be built for preconditioning. */
846 swtch = loop_split_edge_with (loop_preheader_edge (loop),
847 NULL_RTX, loops);
849 for (i = 0; i < n_peel; i++)
851 /* Peel the copy. */
852 sbitmap_zero (wont_exit);
853 if (i != n_peel - 1 || !last_may_exit)
854 SET_BIT (wont_exit, 1);
855 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
856 loops, 1,
857 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
858 DLTHE_FLAG_UPDATE_FREQ))
859 abort ();
861 /* Create item for switch. */
862 j = n_peel - i - (extra_zero_check ? 0 : 1);
863 p = REG_BR_PROB_BASE / (i + 2);
865 preheader = loop_split_edge_with (loop_preheader_edge (loop),
866 NULL_RTX, loops);
867 label = block_label (preheader);
868 start_sequence ();
869 do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
870 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
871 label);
872 jump = get_last_insn ();
873 JUMP_LABEL (jump) = label;
874 REG_NOTES (jump)
875 = gen_rtx_EXPR_LIST (REG_BR_PROB,
876 GEN_INT (p), REG_NOTES (jump));
878 LABEL_NUSES (label)++;
879 branch_code = get_insns ();
880 end_sequence ();
882 swtch = loop_split_edge_with (swtch->pred, branch_code, loops);
883 set_immediate_dominator (loops->cfg.dom, preheader, swtch);
884 swtch->succ->probability = REG_BR_PROB_BASE - p;
885 e = make_edge (swtch, preheader,
886 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
887 e->probability = p;
890 if (extra_zero_check)
892 /* Add branch for zero iterations. */
893 p = REG_BR_PROB_BASE / (max_unroll + 1);
894 swtch = ezc_swtch;
895 preheader = loop_split_edge_with (loop_preheader_edge (loop),
896 NULL_RTX, loops);
897 label = block_label (preheader);
898 start_sequence ();
899 do_compare_rtx_and_jump (copy_rtx (niter), const0_rtx, EQ, 0,
900 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
901 label);
902 jump = get_last_insn ();
903 JUMP_LABEL (jump) = label;
904 REG_NOTES (jump)
905 = gen_rtx_EXPR_LIST (REG_BR_PROB,
906 GEN_INT (p), REG_NOTES (jump));
908 LABEL_NUSES (label)++;
909 branch_code = get_insns ();
910 end_sequence ();
912 swtch = loop_split_edge_with (swtch->succ, branch_code, loops);
913 set_immediate_dominator (loops->cfg.dom, preheader, swtch);
914 swtch->succ->probability = REG_BR_PROB_BASE - p;
915 e = make_edge (swtch, preheader,
916 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
917 e->probability = p;
920 /* Recount dominators for outer blocks. */
921 iterate_fix_dominators (loops->cfg.dom, dom_bbs, n_dom_bbs);
923 /* And unroll loop. */
925 sbitmap_ones (wont_exit);
926 RESET_BIT (wont_exit, may_exit_copy);
928 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
929 loops, max_unroll,
930 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
931 DLTHE_FLAG_UPDATE_FREQ))
932 abort ();
934 free (wont_exit);
936 /* Remove the edges. */
937 for (i = 0; i < n_remove_edges; i++)
938 remove_path (loops, remove_edges[i]);
939 free (remove_edges);
941 if (rtl_dump_file)
942 fprintf (rtl_dump_file,
943 ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n",
944 max_unroll, num_loop_insns (loop));
947 /* Decide whether to simply peel LOOP and how much. */
948 static void
949 decide_peel_simple (struct loops *loops, struct loop *loop, int flags)
951 unsigned npeel;
953 if (!(flags & UAP_PEEL))
955 /* We were not asked to, just return back silently. */
956 return;
959 if (rtl_dump_file)
960 fprintf (rtl_dump_file, ";; Considering simply peeling loop\n");
962 /* npeel = number of iterations to peel. */
963 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
964 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
965 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
967 /* Skip big loops. */
968 if (!npeel)
970 if (rtl_dump_file)
971 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
972 return;
975 /* Check for simple loops. */
976 if (!loop->has_desc)
978 loop->simple = simple_loop_p (loops, loop, &loop->desc);
979 loop->has_desc = 1;
982 /* Check number of iterations. */
983 if (loop->simple && loop->desc.const_iter)
985 if (rtl_dump_file)
986 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
987 return;
990 /* Do not simply peel loops with branches inside -- it increases number
991 of mispredicts. */
992 if (loop->desc.n_branches > 1)
994 if (rtl_dump_file)
995 fprintf (rtl_dump_file, ";; Not peeling, contains branches\n");
996 return;
999 if (loop->header->count)
1001 unsigned niter = expected_loop_iterations (loop);
1002 if (niter + 1 > npeel)
1004 if (rtl_dump_file)
1006 fprintf (rtl_dump_file, ";; Not peeling loop, rolls too much (");
1007 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC, (HOST_WIDEST_INT) (niter + 1));
1008 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
1010 return;
1012 npeel = niter + 1;
1014 else
1016 /* For now we have no good heuristics to decide whether loop peeling
1017 will be effective, so disable it. */
1018 if (rtl_dump_file)
1019 fprintf (rtl_dump_file,
1020 ";; Not peeling loop, no evidence it will be profitable\n");
1021 return;
1024 /* Success. */
1025 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1026 loop->lpt_decision.times = npeel;
1029 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1030 while (cond)
1031 body;
1035 if (!cond) goto end;
1036 body;
1037 if (!cond) goto end;
1038 body;
1039 while (cond)
1040 body;
1041 end: ;
1043 static void
1044 peel_loop_simple (struct loops *loops, struct loop *loop)
1046 sbitmap wont_exit;
1047 unsigned npeel = loop->lpt_decision.times;
1049 wont_exit = sbitmap_alloc (npeel + 1);
1050 sbitmap_zero (wont_exit);
1052 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1053 loops, npeel, wont_exit, NULL, NULL, NULL,
1054 DLTHE_FLAG_UPDATE_FREQ))
1055 abort ();
1057 free (wont_exit);
1059 if (rtl_dump_file)
1060 fprintf (rtl_dump_file, ";; Peeling loop %d times\n", npeel);
1063 /* Decide whether to unroll LOOP stupidly and how much. */
1064 static void
1065 decide_unroll_stupid (struct loops *loops, struct loop *loop, int flags)
1067 unsigned nunroll, nunroll_by_av, i;
1069 if (!(flags & UAP_UNROLL_ALL))
1071 /* We were not asked to, just return back silently. */
1072 return;
1075 if (rtl_dump_file)
1076 fprintf (rtl_dump_file, ";; Considering unrolling loop stupidly\n");
1078 /* nunroll = total number of copies of the original loop body in
1079 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1080 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1081 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1082 if (nunroll > nunroll_by_av)
1083 nunroll = nunroll_by_av;
1084 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1085 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1087 /* Skip big loops. */
1088 if (nunroll <= 1)
1090 if (rtl_dump_file)
1091 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
1092 return;
1095 /* Check for simple loops. */
1096 if (!loop->has_desc)
1098 loop->simple = simple_loop_p (loops, loop, &loop->desc);
1099 loop->has_desc = 1;
1102 /* Check simpleness. */
1103 if (loop->simple)
1105 if (rtl_dump_file)
1106 fprintf (rtl_dump_file, ";; The loop is simple\n");
1107 return;
1110 /* Do not unroll loops with branches inside -- it increases number
1111 of mispredicts. */
1112 if (loop->desc.n_branches > 1)
1114 if (rtl_dump_file)
1115 fprintf (rtl_dump_file, ";; Not unrolling, contains branches\n");
1116 return;
1119 /* If we have profile feedback, check whether the loop rolls. */
1120 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
1122 if (rtl_dump_file)
1123 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
1124 return;
1127 /* Success. Now force nunroll to be power of 2, as it seems that this
1128 improves results (partially because of better alignments, partially
1129 because of some dark magic). */
1130 for (i = 1; 2 * i <= nunroll; i *= 2);
1132 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1133 loop->lpt_decision.times = i - 1;
1136 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1137 while (cond)
1138 body;
1142 while (cond)
1144 body;
1145 if (!cond) break;
1146 body;
1147 if (!cond) break;
1148 body;
1149 if (!cond) break;
1150 body;
1153 static void
1154 unroll_loop_stupid (struct loops *loops, struct loop *loop)
1156 sbitmap wont_exit;
1157 unsigned nunroll = loop->lpt_decision.times;
1159 wont_exit = sbitmap_alloc (nunroll + 1);
1160 sbitmap_zero (wont_exit);
1162 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1163 loops, nunroll, wont_exit, NULL, NULL, NULL,
1164 DLTHE_FLAG_UPDATE_FREQ))
1165 abort ();
1167 free (wont_exit);
1169 if (rtl_dump_file)
1170 fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n",
1171 nunroll, num_loop_insns (loop));