gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "rtl.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "cfglayout.h"
  31 #include "params.h"
  32 #include "output.h"
  33 #include "expr.h"
  34 #include "hashtab.h"
  35 #include "recog.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
  42    by fewer testings of exit conditions.  The main problem is code growth,
  43    that impacts performance negatively due to effect of caches.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
  64    There is a lot of parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
  66
  67    ??? A great problem is that we don't have a good way how to determine
  68    how many times we should unroll the loop; the experiments I have made
  69    showed that this choice may affect performance in order of several %.
  70    */
  71
  72 /* Information about induction variables to split.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx insn;             /* The insn in that the induction variable occurs.  */
  77   rtx base_var;         /* The variable on that the values in the further
  78                            iterations are based.  */
  79   rtx step;             /* Step of the induction variable.  */
  80   struct iv_to_split *next; /* Next entry in walking order.  */
  81   unsigned n_loc;
  82   unsigned loc[3];      /* Location where the definition of the induction
  83                            variable occurs in the insn.  For example if
  84                            N_LOC is 2, the expression is located at
  85                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  86 };
  87
  88 /* Information about accumulators to expand.  */
  89
  90 struct var_to_expand
  91 {
  92   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  93   rtx reg;                         /* The accumulator which is expanded.  */
  94   VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  95   struct var_to_expand *next;      /* Next entry in walking order.  */
  96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  97                                       or multiplication.  */
  98   int expansion_count;             /* Count the number of expansions generated so far.  */
  99   int reuse_expansion;             /* The expansion we intend to reuse to expand
 100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 101                                       the original accumulator.  Else use
 102                                       var_expansions[REUSE_EXPANSION - 1].  */
 103   unsigned accum_pos;              /* The position in which the accumulator is placed in
 104                                       the insn src.  For example in x = x + something
 105                                       accum_pos is 0 while in x = something + x accum_pos
 106                                       is 1.  */
 107 };
 108
 109 /* Information about optimization applied in
 110    the unrolled loop.  */
 111
 112 struct opt_info
 113 {
 114   htab_t insns_to_split;           /* A hashtable of insns to split.  */
 115   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 116   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 117   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
 118                                       to expand.  */
 119   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 120   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 121   unsigned first_new_block;        /* The first basic block that was
 122                                       duplicated.  */
 123   basic_block loop_exit;           /* The loop exit basic block.  */
 124   basic_block loop_preheader;      /* The loop preheader basic block.  */
 125 };
 126
 127 static void decide_unrolling_and_peeling (int);
 128 static void peel_loops_completely (int);
 129 static void decide_peel_simple (struct loop *, int);
 130 static void decide_peel_once_rolling (struct loop *, int);
 131 static void decide_peel_completely (struct loop *, int);
 132 static void decide_unroll_stupid (struct loop *, int);
 133 static void decide_unroll_constant_iterations (struct loop *, int);
 134 static void decide_unroll_runtime_iterations (struct loop *, int);
 135 static void peel_loop_simple (struct loop *);
 136 static void peel_loop_completely (struct loop *);
 137 static void unroll_loop_stupid (struct loop *);
 138 static void unroll_loop_constant_iterations (struct loop *);
 139 static void unroll_loop_runtime_iterations (struct loop *);
 140 static struct opt_info *analyze_insns_in_loop (struct loop *);
 141 static void opt_info_start_duplication (struct opt_info *);
 142 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 143 static void free_opt_info (struct opt_info *);
 144 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 145 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 146 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 147 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 148 static void insert_var_expansion_initialization (struct var_to_expand *,
 149                                                  basic_block);
 150 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 151                                              basic_block);
 152 static rtx get_expansion (struct var_to_expand *);
 153
 154 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 155 void
 156 unroll_and_peel_loops (int flags)
 157 {
 158   struct loop *loop;
 159   bool check;
 160   loop_iterator li;
 161
 162   /* First perform complete loop peeling (it is almost surely a win,
 163      and affects parameters for further decision a lot).  */
 164   peel_loops_completely (flags);
 165
 166   /* Now decide rest of unrolling and peeling.  */
 167   decide_unrolling_and_peeling (flags);
 168
 169   /* Scan the loops, inner ones first.  */
 170   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 171     {
 172       check = true;
 173       /* And perform the appropriate transformations.  */
 174       switch (loop->lpt_decision.decision)
 175         {
 176         case LPT_PEEL_COMPLETELY:
 177           /* Already done.  */
 178           gcc_unreachable ();
 179         case LPT_PEEL_SIMPLE:
 180           peel_loop_simple (loop);
 181           break;
 182         case LPT_UNROLL_CONSTANT:
 183           unroll_loop_constant_iterations (loop);
 184           break;
 185         case LPT_UNROLL_RUNTIME:
 186           unroll_loop_runtime_iterations (loop);
 187           break;
 188         case LPT_UNROLL_STUPID:
 189           unroll_loop_stupid (loop);
 190           break;
 191         case LPT_NONE:
 192           check = false;
 193           break;
 194         default:
 195           gcc_unreachable ();
 196         }
 197       if (check)
 198         {
 199 #ifdef ENABLE_CHECKING
 200           verify_dominators (CDI_DOMINATORS);
 201           verify_loop_structure ();
 202 #endif
 203         }
 204     }
 205
 206   iv_analysis_done ();
 207 }
 208
 209 /* Check whether exit of the LOOP is at the end of loop body.  */
 210
 211 static bool
 212 loop_exit_at_end_p (struct loop *loop)
 213 {
 214   struct niter_desc *desc = get_simple_loop_desc (loop);
 215   rtx insn;
 216
 217   if (desc->in_edge->dest != loop->latch)
 218     return false;
 219
 220   /* Check that the latch is empty.  */
 221   FOR_BB_INSNS (loop->latch, insn)
 222     {
 223       if (INSN_P (insn))
 224         return false;
 225     }
 226
 227   return true;
 228 }
 229
 230 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 231 static void
 232 peel_loops_completely (int flags)
 233 {
 234   struct loop *loop;
 235   loop_iterator li;
 236
 237   /* Scan the loops, the inner ones first.  */
 238   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 239     {
 240       loop->lpt_decision.decision = LPT_NONE;
 241
 242       if (dump_file)
 243         fprintf (dump_file,
 244                  "\n;; *** Considering loop %d for complete peeling ***\n",
 245                  loop->num);
 246
 247       loop->ninsns = num_loop_insns (loop);
 248
 249       decide_peel_once_rolling (loop, flags);
 250       if (loop->lpt_decision.decision == LPT_NONE)
 251         decide_peel_completely (loop, flags);
 252
 253       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 254         {
 255           peel_loop_completely (loop);
 256 #ifdef ENABLE_CHECKING
 257           verify_dominators (CDI_DOMINATORS);
 258           verify_loop_structure ();
 259 #endif
 260         }
 261     }
 262 }
 263
 264 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 265 static void
 266 decide_unrolling_and_peeling (int flags)
 267 {
 268   struct loop *loop;
 269   loop_iterator li;
 270
 271   /* Scan the loops, inner ones first.  */
 272   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 273     {
 274       loop->lpt_decision.decision = LPT_NONE;
 275
 276       if (dump_file)
 277         fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 278
 279       /* Do not peel cold areas.  */
 280       if (optimize_loop_for_size_p (loop))
 281         {
 282           if (dump_file)
 283             fprintf (dump_file, ";; Not considering loop, cold area\n");
 284           continue;
 285         }
 286
 287       /* Can the loop be manipulated?  */
 288       if (!can_duplicate_loop_p (loop))
 289         {
 290           if (dump_file)
 291             fprintf (dump_file,
 292                      ";; Not considering loop, cannot duplicate\n");
 293           continue;
 294         }
 295
 296       /* Skip non-innermost loops.  */
 297       if (loop->inner)
 298         {
 299           if (dump_file)
 300             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 301           continue;
 302         }
 303
 304       loop->ninsns = num_loop_insns (loop);
 305       loop->av_ninsns = average_num_loop_insns (loop);
 306
 307       /* Try transformations one by one in decreasing order of
 308          priority.  */
 309
 310       decide_unroll_constant_iterations (loop, flags);
 311       if (loop->lpt_decision.decision == LPT_NONE)
 312         decide_unroll_runtime_iterations (loop, flags);
 313       if (loop->lpt_decision.decision == LPT_NONE)
 314         decide_unroll_stupid (loop, flags);
 315       if (loop->lpt_decision.decision == LPT_NONE)
 316         decide_peel_simple (loop, flags);
 317     }
 318 }
 319
 320 /* Decide whether the LOOP is once rolling and suitable for complete
 321    peeling.  */
 322 static void
 323 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 324 {
 325   struct niter_desc *desc;
 326
 327   if (dump_file)
 328     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 329
 330   /* Is the loop small enough?  */
 331   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 332     {
 333       if (dump_file)
 334         fprintf (dump_file, ";; Not considering loop, is too big\n");
 335       return;
 336     }
 337
 338   /* Check for simple loops.  */
 339   desc = get_simple_loop_desc (loop);
 340
 341   /* Check number of iterations.  */
 342   if (!desc->simple_p
 343       || desc->assumptions
 344       || desc->infinite
 345       || !desc->const_iter
 346       || desc->niter != 0)
 347     {
 348       if (dump_file)
 349         fprintf (dump_file,
 350                  ";; Unable to prove that the loop rolls exactly once\n");
 351       return;
 352     }
 353
 354   /* Success.  */
 355   if (dump_file)
 356     fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
 357   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 358 }
 359
 360 /* Decide whether the LOOP is suitable for complete peeling.  */
 361 static void
 362 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 363 {
 364   unsigned npeel;
 365   struct niter_desc *desc;
 366
 367   if (dump_file)
 368     fprintf (dump_file, "\n;; Considering peeling completely\n");
 369
 370   /* Skip non-innermost loops.  */
 371   if (loop->inner)
 372     {
 373       if (dump_file)
 374         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 375       return;
 376     }
 377
 378   /* Do not peel cold areas.  */
 379   if (optimize_loop_for_size_p (loop))
 380     {
 381       if (dump_file)
 382         fprintf (dump_file, ";; Not considering loop, cold area\n");
 383       return;
 384     }
 385
 386   /* Can the loop be manipulated?  */
 387   if (!can_duplicate_loop_p (loop))
 388     {
 389       if (dump_file)
 390         fprintf (dump_file,
 391                  ";; Not considering loop, cannot duplicate\n");
 392       return;
 393     }
 394
 395   /* npeel = number of iterations to peel.  */
 396   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 397   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 398     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 399
 400   /* Is the loop small enough?  */
 401   if (!npeel)
 402     {
 403       if (dump_file)
 404         fprintf (dump_file, ";; Not considering loop, is too big\n");
 405       return;
 406     }
 407
 408   /* Check for simple loops.  */
 409   desc = get_simple_loop_desc (loop);
 410
 411   /* Check number of iterations.  */
 412   if (!desc->simple_p
 413       || desc->assumptions
 414       || !desc->const_iter
 415       || desc->infinite)
 416     {
 417       if (dump_file)
 418         fprintf (dump_file,
 419                  ";; Unable to prove that the loop iterates constant times\n");
 420       return;
 421     }
 422
 423   if (desc->niter > npeel - 1)
 424     {
 425       if (dump_file)
 426         {
 427           fprintf (dump_file,
 428                    ";; Not peeling loop completely, rolls too much (");
 429           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 430           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 431         }
 432       return;
 433     }
 434
 435   /* Success.  */
 436   if (dump_file)
 437     fprintf (dump_file, ";; Decided to peel loop completely\n");
 438   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 439 }
 440
 441 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 442    completely.  The transformation done:
 443
 444    for (i = 0; i < 4; i++)
 445      body;
 446
 447    ==>
 448
 449    i = 0;
 450    body; i++;
 451    body; i++;
 452    body; i++;
 453    body; i++;
 454    */
 455 static void
 456 peel_loop_completely (struct loop *loop)
 457 {
 458   sbitmap wont_exit;
 459   unsigned HOST_WIDE_INT npeel;
 460   unsigned i;
 461   VEC (edge, heap) *remove_edges;
 462   edge ein;
 463   struct niter_desc *desc = get_simple_loop_desc (loop);
 464   struct opt_info *opt_info = NULL;
 465
 466   npeel = desc->niter;
 467
 468   if (npeel)
 469     {
 470       bool ok;
 471
 472       wont_exit = sbitmap_alloc (npeel + 1);
 473       sbitmap_ones (wont_exit);
 474       RESET_BIT (wont_exit, 0);
 475       if (desc->noloop_assumptions)
 476         RESET_BIT (wont_exit, 1);
 477
 478       remove_edges = NULL;
 479
 480       if (flag_split_ivs_in_unroller)
 481         opt_info = analyze_insns_in_loop (loop);
 482
 483       opt_info_start_duplication (opt_info);
 484       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 485                                           npeel,
 486                                           wont_exit, desc->out_edge,
 487                                           &remove_edges,
 488                                           DLTHE_FLAG_UPDATE_FREQ
 489                                           | DLTHE_FLAG_COMPLETTE_PEEL
 490                                           | (opt_info
 491                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 492       gcc_assert (ok);
 493
 494       free (wont_exit);
 495
 496       if (opt_info)
 497         {
 498           apply_opt_in_copies (opt_info, npeel, false, true);
 499           free_opt_info (opt_info);
 500         }
 501
 502       /* Remove the exit edges.  */
 503       for (i = 0; VEC_iterate (edge, remove_edges, i, ein); i++)
 504         remove_path (ein);
 505       VEC_free (edge, heap, remove_edges);
 506     }
 507
 508   ein = desc->in_edge;
 509   free_simple_loop_desc (loop);
 510
 511   /* Now remove the unreachable part of the last iteration and cancel
 512      the loop.  */
 513   remove_path (ein);
 514
 515   if (dump_file)
 516     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 517 }
 518
 519 /* Decide whether to unroll LOOP iterating constant number of times
 520    and how much.  */
 521
 522 static void
 523 decide_unroll_constant_iterations (struct loop *loop, int flags)
 524 {
 525   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 526   struct niter_desc *desc;
 527
 528   if (!(flags & UAP_UNROLL))
 529     {
 530       /* We were not asked to, just return back silently.  */
 531       return;
 532     }
 533
 534   if (dump_file)
 535     fprintf (dump_file,
 536              "\n;; Considering unrolling loop with constant "
 537              "number of iterations\n");
 538
 539   /* nunroll = total number of copies of the original loop body in
 540      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 541   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 542   nunroll_by_av
 543     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 544   if (nunroll > nunroll_by_av)
 545     nunroll = nunroll_by_av;
 546   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 547     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 548
 549   /* Skip big loops.  */
 550   if (nunroll <= 1)
 551     {
 552       if (dump_file)
 553         fprintf (dump_file, ";; Not considering loop, is too big\n");
 554       return;
 555     }
 556
 557   /* Check for simple loops.  */
 558   desc = get_simple_loop_desc (loop);
 559
 560   /* Check number of iterations.  */
 561   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 562     {
 563       if (dump_file)
 564         fprintf (dump_file,
 565                  ";; Unable to prove that the loop iterates constant times\n");
 566       return;
 567     }
 568
 569   /* Check whether the loop rolls enough to consider.  */
 570   if (desc->niter < 2 * nunroll)
 571     {
 572       if (dump_file)
 573         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 574       return;
 575     }
 576
 577   /* Success; now compute number of iterations to unroll.  We alter
 578      nunroll so that as few as possible copies of loop body are
 579      necessary, while still not decreasing the number of unrollings
 580      too much (at most by 1).  */
 581   best_copies = 2 * nunroll + 10;
 582
 583   i = 2 * nunroll + 2;
 584   if (i - 1 >= desc->niter)
 585     i = desc->niter - 2;
 586
 587   for (; i >= nunroll - 1; i--)
 588     {
 589       unsigned exit_mod = desc->niter % (i + 1);
 590
 591       if (!loop_exit_at_end_p (loop))
 592         n_copies = exit_mod + i + 1;
 593       else if (exit_mod != (unsigned) i
 594                || desc->noloop_assumptions != NULL_RTX)
 595         n_copies = exit_mod + i + 2;
 596       else
 597         n_copies = i + 1;
 598
 599       if (n_copies < best_copies)
 600         {
 601           best_copies = n_copies;
 602           best_unroll = i;
 603         }
 604     }
 605
 606   if (dump_file)
 607     fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
 608              best_unroll + 1, best_copies, nunroll);
 609
 610   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 611   loop->lpt_decision.times = best_unroll;
 612
 613   if (dump_file)
 614     fprintf (dump_file,
 615              ";; Decided to unroll the constant times rolling loop, %d times.\n",
 616              loop->lpt_decision.times);
 617 }
 618
 619 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
 620    times.  The transformation does this:
 621
 622    for (i = 0; i < 102; i++)
 623      body;
 624
 625    ==>
 626
 627    i = 0;
 628    body; i++;
 629    body; i++;
 630    while (i < 102)
 631      {
 632        body; i++;
 633        body; i++;
 634        body; i++;
 635        body; i++;
 636      }
 637   */
 638 static void
 639 unroll_loop_constant_iterations (struct loop *loop)
 640 {
 641   unsigned HOST_WIDE_INT niter;
 642   unsigned exit_mod;
 643   sbitmap wont_exit;
 644   unsigned i;
 645   VEC (edge, heap) *remove_edges;
 646   edge e;
 647   unsigned max_unroll = loop->lpt_decision.times;
 648   struct niter_desc *desc = get_simple_loop_desc (loop);
 649   bool exit_at_end = loop_exit_at_end_p (loop);
 650   struct opt_info *opt_info = NULL;
 651   bool ok;
 652
 653   niter = desc->niter;
 654
 655   /* Should not get here (such loop should be peeled instead).  */
 656   gcc_assert (niter > max_unroll + 1);
 657
 658   exit_mod = niter % (max_unroll + 1);
 659
 660   wont_exit = sbitmap_alloc (max_unroll + 1);
 661   sbitmap_ones (wont_exit);
 662
 663   remove_edges = NULL;
 664   if (flag_split_ivs_in_unroller
 665       || flag_variable_expansion_in_unroller)
 666     opt_info = analyze_insns_in_loop (loop);
 667
 668   if (!exit_at_end)
 669     {
 670       /* The exit is not at the end of the loop; leave exit test
 671          in the first copy, so that the loops that start with test
 672          of exit condition have continuous body after unrolling.  */
 673
 674       if (dump_file)
 675         fprintf (dump_file, ";; Condition on beginning of loop.\n");
 676
 677       /* Peel exit_mod iterations.  */
 678       RESET_BIT (wont_exit, 0);
 679       if (desc->noloop_assumptions)
 680         RESET_BIT (wont_exit, 1);
 681
 682       if (exit_mod)
 683         {
 684           opt_info_start_duplication (opt_info);
 685           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 686                                               exit_mod,
 687                                               wont_exit, desc->out_edge,
 688                                               &remove_edges,
 689                                               DLTHE_FLAG_UPDATE_FREQ
 690                                               | (opt_info && exit_mod > 1
 691                                                  ? DLTHE_RECORD_COPY_NUMBER
 692                                                    : 0));
 693           gcc_assert (ok);
 694
 695           if (opt_info && exit_mod > 1)
 696             apply_opt_in_copies (opt_info, exit_mod, false, false);
 697
 698           desc->noloop_assumptions = NULL_RTX;
 699           desc->niter -= exit_mod;
 700           desc->niter_max -= exit_mod;
 701         }
 702
 703       SET_BIT (wont_exit, 1);
 704     }
 705   else
 706     {
 707       /* Leave exit test in last copy, for the same reason as above if
 708          the loop tests the condition at the end of loop body.  */
 709
 710       if (dump_file)
 711         fprintf (dump_file, ";; Condition on end of loop.\n");
 712
 713       /* We know that niter >= max_unroll + 2; so we do not need to care of
 714          case when we would exit before reaching the loop.  So just peel
 715          exit_mod + 1 iterations.  */
 716       if (exit_mod != max_unroll
 717           || desc->noloop_assumptions)
 718         {
 719           RESET_BIT (wont_exit, 0);
 720           if (desc->noloop_assumptions)
 721             RESET_BIT (wont_exit, 1);
 722
 723           opt_info_start_duplication (opt_info);
 724           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 725                                               exit_mod + 1,
 726                                               wont_exit, desc->out_edge,
 727                                               &remove_edges,
 728                                               DLTHE_FLAG_UPDATE_FREQ
 729                                               | (opt_info && exit_mod > 0
 730                                                  ? DLTHE_RECORD_COPY_NUMBER
 731                                                    : 0));
 732           gcc_assert (ok);
 733
 734           if (opt_info && exit_mod > 0)
 735             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 736
 737           desc->niter -= exit_mod + 1;
 738           desc->niter_max -= exit_mod + 1;
 739           desc->noloop_assumptions = NULL_RTX;
 740
 741           SET_BIT (wont_exit, 0);
 742           SET_BIT (wont_exit, 1);
 743         }
 744
 745       RESET_BIT (wont_exit, max_unroll);
 746     }
 747
 748   /* Now unroll the loop.  */
 749
 750   opt_info_start_duplication (opt_info);
 751   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 752                                       max_unroll,
 753                                       wont_exit, desc->out_edge,
 754                                       &remove_edges,
 755                                       DLTHE_FLAG_UPDATE_FREQ
 756                                       | (opt_info
 757                                          ? DLTHE_RECORD_COPY_NUMBER
 758                                            : 0));
 759   gcc_assert (ok);
 760
 761   if (opt_info)
 762     {
 763       apply_opt_in_copies (opt_info, max_unroll, true, true);
 764       free_opt_info (opt_info);
 765     }
 766
 767   free (wont_exit);
 768
 769   if (exit_at_end)
 770     {
 771       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 772       /* Find a new in and out edge; they are in the last copy we have made.  */
 773
 774       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 775         {
 776           desc->out_edge = EDGE_SUCC (exit_block, 0);
 777           desc->in_edge = EDGE_SUCC (exit_block, 1);
 778         }
 779       else
 780         {
 781           desc->out_edge = EDGE_SUCC (exit_block, 1);
 782           desc->in_edge = EDGE_SUCC (exit_block, 0);
 783         }
 784     }
 785
 786   desc->niter /= max_unroll + 1;
 787   desc->niter_max /= max_unroll + 1;
 788   desc->niter_expr = GEN_INT (desc->niter);
 789
 790   /* Remove the edges.  */
 791   for (i = 0; VEC_iterate (edge, remove_edges, i, e); i++)
 792     remove_path (e);
 793   VEC_free (edge, heap, remove_edges);
 794
 795   if (dump_file)
 796     fprintf (dump_file,
 797              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 798              max_unroll, num_loop_insns (loop));
 799 }
 800
 801 /* Decide whether to unroll LOOP iterating runtime computable number of times
 802    and how much.  */
 803 static void
 804 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 805 {
 806   unsigned nunroll, nunroll_by_av, i;
 807   struct niter_desc *desc;
 808
 809   if (!(flags & UAP_UNROLL))
 810     {
 811       /* We were not asked to, just return back silently.  */
 812       return;
 813     }
 814
 815   if (dump_file)
 816     fprintf (dump_file,
 817              "\n;; Considering unrolling loop with runtime "
 818              "computable number of iterations\n");
 819
 820   /* nunroll = total number of copies of the original loop body in
 821      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 822   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 823   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 824   if (nunroll > nunroll_by_av)
 825     nunroll = nunroll_by_av;
 826   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 827     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 828
 829   /* Skip big loops.  */
 830   if (nunroll <= 1)
 831     {
 832       if (dump_file)
 833         fprintf (dump_file, ";; Not considering loop, is too big\n");
 834       return;
 835     }
 836
 837   /* Check for simple loops.  */
 838   desc = get_simple_loop_desc (loop);
 839
 840   /* Check simpleness.  */
 841   if (!desc->simple_p || desc->assumptions)
 842     {
 843       if (dump_file)
 844         fprintf (dump_file,
 845                  ";; Unable to prove that the number of iterations "
 846                  "can be counted in runtime\n");
 847       return;
 848     }
 849
 850   if (desc->const_iter)
 851     {
 852       if (dump_file)
 853         fprintf (dump_file, ";; Loop iterates constant times\n");
 854       return;
 855     }
 856
 857   /* If we have profile feedback, check whether the loop rolls.  */
 858   if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
 859     {
 860       if (dump_file)
 861         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 862       return;
 863     }
 864
 865   /* Success; now force nunroll to be power of 2, as we are unable to
 866      cope with overflows in computation of number of iterations.  */
 867   for (i = 1; 2 * i <= nunroll; i *= 2)
 868     continue;
 869
 870   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 871   loop->lpt_decision.times = i - 1;
 872
 873   if (dump_file)
 874     fprintf (dump_file,
 875              ";; Decided to unroll the runtime computable "
 876              "times rolling loop, %d times.\n",
 877              loop->lpt_decision.times);
 878 }
 879
 880 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
 881    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
 882    and NULL is returned instead.  */
 883
 884 basic_block
 885 split_edge_and_insert (edge e, rtx insns)
 886 {
 887   basic_block bb;
 888
 889   if (!insns)
 890     return NULL;
 891   bb = split_edge (e);
 892   emit_insn_after (insns, BB_END (bb));
 893
 894   /* ??? We used to assume that INSNS can contain control flow insns, and
 895      that we had to try to find sub basic blocks in BB to maintain a valid
 896      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
 897      and call break_superblocks when going out of cfglayout mode.  But it
 898      turns out that this never happens; and that if it does ever happen,
 899      the verify_flow_info call in loop_optimizer_finalize would fail.
 900
 901      There are two reasons why we expected we could have control flow insns
 902      in INSNS.  The first is when a comparison has to be done in parts, and
 903      the second is when the number of iterations is computed for loops with
 904      the number of iterations known at runtime.  In both cases, test cases
 905      to get control flow in INSNS appear to be impossible to construct:
 906
 907       * If do_compare_rtx_and_jump needs several branches to do comparison
 908         in a mode that needs comparison by parts, we cannot analyze the
 909         number of iterations of the loop, and we never get to unrolling it.
 910
 911       * The code in expand_divmod that was suspected to cause creation of
 912         branching code seems to be only accessed for signed division.  The
 913         divisions used by # of iterations analysis are always unsigned.
 914         Problems might arise on architectures that emits branching code
 915         for some operations that may appear in the unroller (especially
 916         for division), but we have no such architectures.
 917
 918      Considering all this, it was decided that we should for now assume
 919      that INSNS can in theory contain control flow insns, but in practice
 920      it never does.  So we don't handle the theoretical case, and should
 921      a real failure ever show up, we have a pretty good clue for how to
 922      fix it.  */
 923
 924   return bb;
 925 }
 926
 927 /* Unroll LOOP for that we are able to count number of iterations in runtime
 928    LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this (with some
 929    extra care for case n < 0):
 930
 931    for (i = 0; i < n; i++)
 932      body;
 933
 934    ==>
 935
 936    i = 0;
 937    mod = n % 4;
 938
 939    switch (mod)
 940      {
 941        case 3:
 942          body; i++;
 943        case 2:
 944          body; i++;
 945        case 1:
 946          body; i++;
 947        case 0: ;
 948      }
 949
 950    while (i < n)
 951      {
 952        body; i++;
 953        body; i++;
 954        body; i++;
 955        body; i++;
 956      }
 957    */
 958 static void
 959 unroll_loop_runtime_iterations (struct loop *loop)
 960 {
 961   rtx old_niter, niter, init_code, branch_code, tmp;
 962   unsigned i, j, p;
 963   basic_block preheader, *body, swtch, ezc_swtch;
 964   VEC (basic_block, heap) *dom_bbs;
 965   sbitmap wont_exit;
 966   int may_exit_copy;
 967   unsigned n_peel;
 968   VEC (edge, heap) *remove_edges;
 969   edge e;
 970   bool extra_zero_check, last_may_exit;
 971   unsigned max_unroll = loop->lpt_decision.times;
 972   struct niter_desc *desc = get_simple_loop_desc (loop);
 973   bool exit_at_end = loop_exit_at_end_p (loop);
 974   struct opt_info *opt_info = NULL;
 975   bool ok;
 976
 977   if (flag_split_ivs_in_unroller
 978       || flag_variable_expansion_in_unroller)
 979     opt_info = analyze_insns_in_loop (loop);
 980
 981   /* Remember blocks whose dominators will have to be updated.  */
 982   dom_bbs = NULL;
 983
 984   body = get_loop_body (loop);
 985   for (i = 0; i < loop->num_nodes; i++)
 986     {
 987       VEC (basic_block, heap) *ldom;
 988       basic_block bb;
 989
 990       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
 991       for (j = 0; VEC_iterate (basic_block, ldom, j, bb); j++)
 992         if (!flow_bb_inside_loop_p (loop, bb))
 993           VEC_safe_push (basic_block, heap, dom_bbs, bb);
 994
 995       VEC_free (basic_block, heap, ldom);
 996     }
 997   free (body);
 998
 999   if (!exit_at_end)
1000     {
1001       /* Leave exit in first copy (for explanation why see comment in
1002          unroll_loop_constant_iterations).  */
1003       may_exit_copy = 0;
1004       n_peel = max_unroll - 1;
1005       extra_zero_check = true;
1006       last_may_exit = false;
1007     }
1008   else
1009     {
1010       /* Leave exit in last copy (for explanation why see comment in
1011          unroll_loop_constant_iterations).  */
1012       may_exit_copy = max_unroll;
1013       n_peel = max_unroll;
1014       extra_zero_check = false;
1015       last_may_exit = true;
1016     }
1017
1018   /* Get expression for number of iterations.  */
1019   start_sequence ();
1020   old_niter = niter = gen_reg_rtx (desc->mode);
1021   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1022   if (tmp != niter)
1023     emit_move_insn (niter, tmp);
1024
1025   /* Count modulo by ANDing it with max_unroll; we use the fact that
1026      the number of unrollings is a power of two, and thus this is correct
1027      even if there is overflow in the computation.  */
1028   niter = expand_simple_binop (desc->mode, AND,
1029                                niter,
1030                                GEN_INT (max_unroll),
1031                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1032
1033   init_code = get_insns ();
1034   end_sequence ();
1035   unshare_all_rtl_in_chain (init_code);
1036
1037   /* Precondition the loop.  */
1038   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1039
1040   remove_edges = NULL;
1041
1042   wont_exit = sbitmap_alloc (max_unroll + 2);
1043
1044   /* Peel the first copy of loop body (almost always we must leave exit test
1045      here; the only exception is when we have extra zero check and the number
1046      of iterations is reliable.  Also record the place of (possible) extra
1047      zero check.  */
1048   sbitmap_zero (wont_exit);
1049   if (extra_zero_check
1050       && !desc->noloop_assumptions)
1051     SET_BIT (wont_exit, 1);
1052   ezc_swtch = loop_preheader_edge (loop)->src;
1053   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1054                                       1, wont_exit, desc->out_edge,
1055                                       &remove_edges,
1056                                       DLTHE_FLAG_UPDATE_FREQ);
1057   gcc_assert (ok);
1058
1059   /* Record the place where switch will be built for preconditioning.  */
1060   swtch = split_edge (loop_preheader_edge (loop));
1061
1062   for (i = 0; i < n_peel; i++)
1063     {
1064       /* Peel the copy.  */
1065       sbitmap_zero (wont_exit);
1066       if (i != n_peel - 1 || !last_may_exit)
1067         SET_BIT (wont_exit, 1);
1068       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1069                                           1, wont_exit, desc->out_edge,
1070                                           &remove_edges,
1071                                           DLTHE_FLAG_UPDATE_FREQ);
1072       gcc_assert (ok);
1073
1074       /* Create item for switch.  */
1075       j = n_peel - i - (extra_zero_check ? 0 : 1);
1076       p = REG_BR_PROB_BASE / (i + 2);
1077
1078       preheader = split_edge (loop_preheader_edge (loop));
1079       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1080                                           block_label (preheader), p,
1081                                           NULL_RTX);
1082
1083       /* We rely on the fact that the compare and jump cannot be optimized out,
1084          and hence the cfg we create is correct.  */
1085       gcc_assert (branch_code != NULL_RTX);
1086
1087       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1088       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1089       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1090       e = make_edge (swtch, preheader,
1091                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1092       e->probability = p;
1093     }
1094
1095   if (extra_zero_check)
1096     {
1097       /* Add branch for zero iterations.  */
1098       p = REG_BR_PROB_BASE / (max_unroll + 1);
1099       swtch = ezc_swtch;
1100       preheader = split_edge (loop_preheader_edge (loop));
1101       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1102                                           block_label (preheader), p,
1103                                           NULL_RTX);
1104       gcc_assert (branch_code != NULL_RTX);
1105
1106       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1107       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1108       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1109       e = make_edge (swtch, preheader,
1110                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1111       e->probability = p;
1112     }
1113
1114   /* Recount dominators for outer blocks.  */
1115   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1116
1117   /* And unroll loop.  */
1118
1119   sbitmap_ones (wont_exit);
1120   RESET_BIT (wont_exit, may_exit_copy);
1121   opt_info_start_duplication (opt_info);
1122
1123   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1124                                       max_unroll,
1125                                       wont_exit, desc->out_edge,
1126                                       &remove_edges,
1127                                       DLTHE_FLAG_UPDATE_FREQ
1128                                       | (opt_info
1129                                          ? DLTHE_RECORD_COPY_NUMBER
1130                                            : 0));
1131   gcc_assert (ok);
1132
1133   if (opt_info)
1134     {
1135       apply_opt_in_copies (opt_info, max_unroll, true, true);
1136       free_opt_info (opt_info);
1137     }
1138
1139   free (wont_exit);
1140
1141   if (exit_at_end)
1142     {
1143       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1144       /* Find a new in and out edge; they are in the last copy we have
1145          made.  */
1146
1147       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1148         {
1149           desc->out_edge = EDGE_SUCC (exit_block, 0);
1150           desc->in_edge = EDGE_SUCC (exit_block, 1);
1151         }
1152       else
1153         {
1154           desc->out_edge = EDGE_SUCC (exit_block, 1);
1155           desc->in_edge = EDGE_SUCC (exit_block, 0);
1156         }
1157     }
1158
1159   /* Remove the edges.  */
1160   for (i = 0; VEC_iterate (edge, remove_edges, i, e); i++)
1161     remove_path (e);
1162   VEC_free (edge, heap, remove_edges);
1163
1164   /* We must be careful when updating the number of iterations due to
1165      preconditioning and the fact that the value must be valid at entry
1166      of the loop.  After passing through the above code, we see that
1167      the correct new number of iterations is this:  */
1168   gcc_assert (!desc->const_iter);
1169   desc->niter_expr =
1170     simplify_gen_binary (UDIV, desc->mode, old_niter,
1171                          GEN_INT (max_unroll + 1));
1172   desc->niter_max /= max_unroll + 1;
1173   if (exit_at_end)
1174     {
1175       desc->niter_expr =
1176         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1177       desc->noloop_assumptions = NULL_RTX;
1178       desc->niter_max--;
1179     }
1180
1181   if (dump_file)
1182     fprintf (dump_file,
1183              ";; Unrolled loop %d times, counting # of iterations "
1184              "in runtime, %i insns\n",
1185              max_unroll, num_loop_insns (loop));
1186
1187   VEC_free (basic_block, heap, dom_bbs);
1188 }
1189
1190 /* Decide whether to simply peel LOOP and how much.  */
1191 static void
1192 decide_peel_simple (struct loop *loop, int flags)
1193 {
1194   unsigned npeel;
1195   struct niter_desc *desc;
1196
1197   if (!(flags & UAP_PEEL))
1198     {
1199       /* We were not asked to, just return back silently.  */
1200       return;
1201     }
1202
1203   if (dump_file)
1204     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1205
1206   /* npeel = number of iterations to peel.  */
1207   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1208   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1209     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1210
1211   /* Skip big loops.  */
1212   if (!npeel)
1213     {
1214       if (dump_file)
1215         fprintf (dump_file, ";; Not considering loop, is too big\n");
1216       return;
1217     }
1218
1219   /* Check for simple loops.  */
1220   desc = get_simple_loop_desc (loop);
1221
1222   /* Check number of iterations.  */
1223   if (desc->simple_p && !desc->assumptions && desc->const_iter)
1224     {
1225       if (dump_file)
1226         fprintf (dump_file, ";; Loop iterates constant times\n");
1227       return;
1228     }
1229
1230   /* Do not simply peel loops with branches inside -- it increases number
1231      of mispredicts.  */
1232   if (num_loop_branches (loop) > 1)
1233     {
1234       if (dump_file)
1235         fprintf (dump_file, ";; Not peeling, contains branches\n");
1236       return;
1237     }
1238
1239   if (loop->header->count)
1240     {
1241       unsigned niter = expected_loop_iterations (loop);
1242       if (niter + 1 > npeel)
1243         {
1244           if (dump_file)
1245             {
1246               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1247               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1248                        (HOST_WIDEST_INT) (niter + 1));
1249               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1250                        npeel);
1251             }
1252           return;
1253         }
1254       npeel = niter + 1;
1255     }
1256   else
1257     {
1258       /* For now we have no good heuristics to decide whether loop peeling
1259          will be effective, so disable it.  */
1260       if (dump_file)
1261         fprintf (dump_file,
1262                  ";; Not peeling loop, no evidence it will be profitable\n");
1263       return;
1264     }
1265
1266   /* Success.  */
1267   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1268   loop->lpt_decision.times = npeel;
1269
1270   if (dump_file)
1271     fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
1272              loop->lpt_decision.times);
1273 }
1274
1275 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1276    while (cond)
1277      body;
1278
1279    ==>
1280
1281    if (!cond) goto end;
1282    body;
1283    if (!cond) goto end;
1284    body;
1285    while (cond)
1286      body;
1287    end: ;
1288    */
1289 static void
1290 peel_loop_simple (struct loop *loop)
1291 {
1292   sbitmap wont_exit;
1293   unsigned npeel = loop->lpt_decision.times;
1294   struct niter_desc *desc = get_simple_loop_desc (loop);
1295   struct opt_info *opt_info = NULL;
1296   bool ok;
1297
1298   if (flag_split_ivs_in_unroller && npeel > 1)
1299     opt_info = analyze_insns_in_loop (loop);
1300
1301   wont_exit = sbitmap_alloc (npeel + 1);
1302   sbitmap_zero (wont_exit);
1303
1304   opt_info_start_duplication (opt_info);
1305
1306   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1307                                       npeel, wont_exit, NULL,
1308                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1309                                       | (opt_info
1310                                          ? DLTHE_RECORD_COPY_NUMBER
1311                                            : 0));
1312   gcc_assert (ok);
1313
1314   free (wont_exit);
1315
1316   if (opt_info)
1317     {
1318       apply_opt_in_copies (opt_info, npeel, false, false);
1319       free_opt_info (opt_info);
1320     }
1321
1322   if (desc->simple_p)
1323     {
1324       if (desc->const_iter)
1325         {
1326           desc->niter -= npeel;
1327           desc->niter_expr = GEN_INT (desc->niter);
1328           desc->noloop_assumptions = NULL_RTX;
1329         }
1330       else
1331         {
1332           /* We cannot just update niter_expr, as its value might be clobbered
1333              inside loop.  We could handle this by counting the number into
1334              temporary just like we do in runtime unrolling, but it does not
1335              seem worthwhile.  */
1336           free_simple_loop_desc (loop);
1337         }
1338     }
1339   if (dump_file)
1340     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1341 }
1342
1343 /* Decide whether to unroll LOOP stupidly and how much.  */
1344 static void
1345 decide_unroll_stupid (struct loop *loop, int flags)
1346 {
1347   unsigned nunroll, nunroll_by_av, i;
1348   struct niter_desc *desc;
1349
1350   if (!(flags & UAP_UNROLL_ALL))
1351     {
1352       /* We were not asked to, just return back silently.  */
1353       return;
1354     }
1355
1356   if (dump_file)
1357     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1358
1359   /* nunroll = total number of copies of the original loop body in
1360      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1361   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1362   nunroll_by_av
1363     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1364   if (nunroll > nunroll_by_av)
1365     nunroll = nunroll_by_av;
1366   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1367     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1368
1369   /* Skip big loops.  */
1370   if (nunroll <= 1)
1371     {
1372       if (dump_file)
1373         fprintf (dump_file, ";; Not considering loop, is too big\n");
1374       return;
1375     }
1376
1377   /* Check for simple loops.  */
1378   desc = get_simple_loop_desc (loop);
1379
1380   /* Check simpleness.  */
1381   if (desc->simple_p && !desc->assumptions)
1382     {
1383       if (dump_file)
1384         fprintf (dump_file, ";; The loop is simple\n");
1385       return;
1386     }
1387
1388   /* Do not unroll loops with branches inside -- it increases number
1389      of mispredicts.  */
1390   if (num_loop_branches (loop) > 1)
1391     {
1392       if (dump_file)
1393         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1394       return;
1395     }
1396
1397   /* If we have profile feedback, check whether the loop rolls.  */
1398   if (loop->header->count
1399       && expected_loop_iterations (loop) < 2 * nunroll)
1400     {
1401       if (dump_file)
1402         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1403       return;
1404     }
1405
1406   /* Success.  Now force nunroll to be power of 2, as it seems that this
1407      improves results (partially because of better alignments, partially
1408      because of some dark magic).  */
1409   for (i = 1; 2 * i <= nunroll; i *= 2)
1410     continue;
1411
1412   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1413   loop->lpt_decision.times = i - 1;
1414
1415   if (dump_file)
1416     fprintf (dump_file,
1417              ";; Decided to unroll the loop stupidly, %d times.\n",
1418              loop->lpt_decision.times);
1419 }
1420
1421 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1422    while (cond)
1423      body;
1424
1425    ==>
1426
1427    while (cond)
1428      {
1429        body;
1430        if (!cond) break;
1431        body;
1432        if (!cond) break;
1433        body;
1434        if (!cond) break;
1435        body;
1436      }
1437    */
1438 static void
1439 unroll_loop_stupid (struct loop *loop)
1440 {
1441   sbitmap wont_exit;
1442   unsigned nunroll = loop->lpt_decision.times;
1443   struct niter_desc *desc = get_simple_loop_desc (loop);
1444   struct opt_info *opt_info = NULL;
1445   bool ok;
1446
1447   if (flag_split_ivs_in_unroller
1448       || flag_variable_expansion_in_unroller)
1449     opt_info = analyze_insns_in_loop (loop);
1450
1451
1452   wont_exit = sbitmap_alloc (nunroll + 1);
1453   sbitmap_zero (wont_exit);
1454   opt_info_start_duplication (opt_info);
1455
1456   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1457                                       nunroll, wont_exit,
1458                                       NULL, NULL,
1459                                       DLTHE_FLAG_UPDATE_FREQ
1460                                       | (opt_info
1461                                          ? DLTHE_RECORD_COPY_NUMBER
1462                                            : 0));
1463   gcc_assert (ok);
1464
1465   if (opt_info)
1466     {
1467       apply_opt_in_copies (opt_info, nunroll, true, true);
1468       free_opt_info (opt_info);
1469     }
1470
1471   free (wont_exit);
1472
1473   if (desc->simple_p)
1474     {
1475       /* We indeed may get here provided that there are nontrivial assumptions
1476          for a loop to be really simple.  We could update the counts, but the
1477          problem is that we are unable to decide which exit will be taken
1478          (not really true in case the number of iterations is constant,
1479          but noone will do anything with this information, so we do not
1480          worry about it).  */
1481       desc->simple_p = false;
1482     }
1483
1484   if (dump_file)
1485     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1486              nunroll, num_loop_insns (loop));
1487 }
1488
1489 /* A hash function for information about insns to split.  */
1490
1491 static hashval_t
1492 si_info_hash (const void *ivts)
1493 {
1494   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1495 }
1496
1497 /* An equality functions for information about insns to split.  */
1498
1499 static int
1500 si_info_eq (const void *ivts1, const void *ivts2)
1501 {
1502   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1503   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1504
1505   return i1->insn == i2->insn;
1506 }
1507
1508 /* Return a hash for VES, which is really a "var_to_expand *".  */
1509
1510 static hashval_t
1511 ve_info_hash (const void *ves)
1512 {
1513   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1514 }
1515
1516 /* Return true if IVTS1 and IVTS2 (which are really both of type
1517    "var_to_expand *") refer to the same instruction.  */
1518
1519 static int
1520 ve_info_eq (const void *ivts1, const void *ivts2)
1521 {
1522   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1523   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1524
1525   return i1->insn == i2->insn;
1526 }
1527
1528 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1529    Set *DEBUG_USES to the number of debug insns that reference the
1530    variable.  */
1531
1532 bool
1533 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1534                                   int *debug_uses)
1535 {
1536   basic_block *body, bb;
1537   unsigned i;
1538   int count_ref = 0;
1539   rtx insn;
1540
1541   body = get_loop_body (loop);
1542   for (i = 0; i < loop->num_nodes; i++)
1543     {
1544       bb = body[i];
1545
1546       FOR_BB_INSNS (bb, insn)
1547         if (!rtx_referenced_p (reg, insn))
1548           continue;
1549         else if (DEBUG_INSN_P (insn))
1550           ++*debug_uses;
1551         else if (++count_ref > 1)
1552           break;
1553     }
1554   free (body);
1555   return (count_ref  == 1);
1556 }
1557
1558 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1559
1560 static void
1561 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1562 {
1563   basic_block *body, bb;
1564   unsigned i;
1565   rtx insn;
1566
1567   body = get_loop_body (loop);
1568   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1569     {
1570       bb = body[i];
1571
1572       FOR_BB_INSNS (bb, insn)
1573         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1574           continue;
1575         else
1576           {
1577             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1578                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1579             if (!--debug_uses)
1580               break;
1581           }
1582     }
1583   free (body);
1584 }
1585
1586 /* Determine whether INSN contains an accumulator
1587    which can be expanded into separate copies,
1588    one for each copy of the LOOP body.
1589
1590    for (i = 0 ; i < n; i++)
1591      sum += a[i];
1592
1593    ==>
1594
1595    sum += a[i]
1596    ....
1597    i = i+1;
1598    sum1 += a[i]
1599    ....
1600    i = i+1
1601    sum2 += a[i];
1602    ....
1603
1604    Return NULL if INSN contains no opportunity for expansion of accumulator.
1605    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1606    information and return a pointer to it.
1607 */
1608
1609 static struct var_to_expand *
1610 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1611 {
1612   rtx set, dest, src, op1, op2, something;
1613   struct var_to_expand *ves;
1614   enum machine_mode mode1, mode2;
1615   unsigned accum_pos;
1616   int debug_uses = 0;
1617
1618   set = single_set (insn);
1619   if (!set)
1620     return NULL;
1621
1622   dest = SET_DEST (set);
1623   src = SET_SRC (set);
1624
1625   if (GET_CODE (src) != PLUS
1626       && GET_CODE (src) != MINUS
1627       && GET_CODE (src) != MULT)
1628     return NULL;
1629
1630   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1631      in MD.  But if there is no optab to generate the insn, we can not
1632      perform the variable expansion.  This can happen if an MD provides
1633      an insn but not a named pattern to generate it, for example to avoid
1634      producing code that needs additional mode switches like for x87/mmx.
1635
1636      So we check have_insn_for which looks for an optab for the operation
1637      in SRC.  If it doesn't exist, we can't perform the expansion even
1638      though INSN is valid.  */
1639   if (!have_insn_for (GET_CODE (src), GET_MODE (src)))
1640     return NULL;
1641
1642   op1 = XEXP (src, 0);
1643   op2 = XEXP (src, 1);
1644
1645   if (!REG_P (dest)
1646       && !(GET_CODE (dest) == SUBREG
1647            && REG_P (SUBREG_REG (dest))))
1648     return NULL;
1649
1650   if (rtx_equal_p (dest, op1))
1651     accum_pos = 0;
1652   else if (rtx_equal_p (dest, op2))
1653     accum_pos = 1;
1654   else
1655     return NULL;
1656
1657   /* The method of expansion that we are using; which includes
1658      the initialization of the expansions with zero and the summation of
1659      the expansions at the end of the computation will yield wrong results
1660      for (x = something - x) thus avoid using it in that case.  */
1661   if (accum_pos == 1
1662       && GET_CODE (src) == MINUS)
1663    return NULL;
1664
1665   something = (accum_pos == 0) ? op2 : op1;
1666
1667   if (rtx_referenced_p (dest, something))
1668     return NULL;
1669
1670   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1671     return NULL;
1672
1673   mode1 = GET_MODE (dest);
1674   mode2 = GET_MODE (something);
1675   if ((FLOAT_MODE_P (mode1)
1676        || FLOAT_MODE_P (mode2))
1677       && !flag_associative_math)
1678     return NULL;
1679
1680   if (dump_file)
1681   {
1682     fprintf (dump_file,
1683     "\n;; Expanding Accumulator ");
1684     print_rtl (dump_file, dest);
1685     fprintf (dump_file, "\n");
1686   }
1687
1688   if (debug_uses)
1689     /* Instead of resetting the debug insns, we could replace each
1690        debug use in the loop with the sum or product of all expanded
1691        accummulators.  Since we'll only know of all expansions at the
1692        end, we'd have to keep track of which vars_to_expand a debug
1693        insn in the loop references, take note of each copy of the
1694        debug insn during unrolling, and when it's all done, compute
1695        the sum or product of each variable and adjust the original
1696        debug insn and each copy thereof.  What a pain!  */
1697     reset_debug_uses_in_loop (loop, dest, debug_uses);
1698
1699   /* Record the accumulator to expand.  */
1700   ves = XNEW (struct var_to_expand);
1701   ves->insn = insn;
1702   ves->reg = copy_rtx (dest);
1703   ves->var_expansions = VEC_alloc (rtx, heap, 1);
1704   ves->next = NULL;
1705   ves->op = GET_CODE (src);
1706   ves->expansion_count = 0;
1707   ves->reuse_expansion = 0;
1708   ves->accum_pos = accum_pos;
1709   return ves;
1710 }
1711
1712 /* Determine whether there is an induction variable in INSN that
1713    we would like to split during unrolling.
1714
1715    I.e. replace
1716
1717    i = i + 1;
1718    ...
1719    i = i + 1;
1720    ...
1721    i = i + 1;
1722    ...
1723
1724    type chains by
1725
1726    i0 = i + 1
1727    ...
1728    i = i0 + 1
1729    ...
1730    i = i0 + 2
1731    ...
1732
1733    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1734    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1735    pointer to it.  */
1736
1737 static struct iv_to_split *
1738 analyze_iv_to_split_insn (rtx insn)
1739 {
1740   rtx set, dest;
1741   struct rtx_iv iv;
1742   struct iv_to_split *ivts;
1743   bool ok;
1744
1745   /* For now we just split the basic induction variables.  Later this may be
1746      extended for example by selecting also addresses of memory references.  */
1747   set = single_set (insn);
1748   if (!set)
1749     return NULL;
1750
1751   dest = SET_DEST (set);
1752   if (!REG_P (dest))
1753     return NULL;
1754
1755   if (!biv_p (insn, dest))
1756     return NULL;
1757
1758   ok = iv_analyze_result (insn, dest, &iv);
1759
1760   /* This used to be an assert under the assumption that if biv_p returns
1761      true that iv_analyze_result must also return true.  However, that
1762      assumption is not strictly correct as evidenced by pr25569.
1763
1764      Returning NULL when iv_analyze_result returns false is safe and
1765      avoids the problems in pr25569 until the iv_analyze_* routines
1766      can be fixed, which is apparently hard and time consuming
1767      according to their author.  */
1768   if (! ok)
1769     return NULL;
1770
1771   if (iv.step == const0_rtx
1772       || iv.mode != iv.extend_mode)
1773     return NULL;
1774
1775   /* Record the insn to split.  */
1776   ivts = XNEW (struct iv_to_split);
1777   ivts->insn = insn;
1778   ivts->base_var = NULL_RTX;
1779   ivts->step = iv.step;
1780   ivts->next = NULL;
1781   ivts->n_loc = 1;
1782   ivts->loc[0] = 1;
1783
1784   return ivts;
1785 }
1786
1787 /* Determines which of insns in LOOP can be optimized.
1788    Return a OPT_INFO struct with the relevant hash tables filled
1789    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1790    is undefined for the return value.  */
1791
1792 static struct opt_info *
1793 analyze_insns_in_loop (struct loop *loop)
1794 {
1795   basic_block *body, bb;
1796   unsigned i;
1797   struct opt_info *opt_info = XCNEW (struct opt_info);
1798   rtx insn;
1799   struct iv_to_split *ivts = NULL;
1800   struct var_to_expand *ves = NULL;
1801   PTR *slot1;
1802   PTR *slot2;
1803   VEC (edge, heap) *edges = get_loop_exit_edges (loop);
1804   edge exit;
1805   bool can_apply = false;
1806
1807   iv_analysis_loop_init (loop);
1808
1809   body = get_loop_body (loop);
1810
1811   if (flag_split_ivs_in_unroller)
1812     {
1813       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1814                                               si_info_hash, si_info_eq, free);
1815       opt_info->iv_to_split_head = NULL;
1816       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1817     }
1818
1819   /* Record the loop exit bb and loop preheader before the unrolling.  */
1820   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1821
1822   if (VEC_length (edge, edges) == 1)
1823     {
1824       exit = VEC_index (edge, edges, 0);
1825       if (!(exit->flags & EDGE_COMPLEX))
1826         {
1827           opt_info->loop_exit = split_edge (exit);
1828           can_apply = true;
1829         }
1830     }
1831
1832   if (flag_variable_expansion_in_unroller
1833       && can_apply)
1834     {
1835       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1836                                                         ve_info_hash,
1837                                                         ve_info_eq, free);
1838       opt_info->var_to_expand_head = NULL;
1839       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1840     }
1841
1842   for (i = 0; i < loop->num_nodes; i++)
1843     {
1844       bb = body[i];
1845       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1846         continue;
1847
1848       FOR_BB_INSNS (bb, insn)
1849       {
1850         if (!INSN_P (insn))
1851           continue;
1852
1853         if (opt_info->insns_to_split)
1854           ivts = analyze_iv_to_split_insn (insn);
1855
1856         if (ivts)
1857           {
1858             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1859             gcc_assert (*slot1 == NULL);
1860             *slot1 = ivts;
1861             *opt_info->iv_to_split_tail = ivts;
1862             opt_info->iv_to_split_tail = &ivts->next;
1863             continue;
1864           }
1865
1866         if (opt_info->insns_with_var_to_expand)
1867           ves = analyze_insn_to_expand_var (loop, insn);
1868
1869         if (ves)
1870           {
1871             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1872             gcc_assert (*slot2 == NULL);
1873             *slot2 = ves;
1874             *opt_info->var_to_expand_tail = ves;
1875             opt_info->var_to_expand_tail = &ves->next;
1876           }
1877       }
1878     }
1879
1880   VEC_free (edge, heap, edges);
1881   free (body);
1882   return opt_info;
1883 }
1884
1885 /* Called just before loop duplication.  Records start of duplicated area
1886    to OPT_INFO.  */
1887
1888 static void
1889 opt_info_start_duplication (struct opt_info *opt_info)
1890 {
1891   if (opt_info)
1892     opt_info->first_new_block = last_basic_block;
1893 }
1894
1895 /* Determine the number of iterations between initialization of the base
1896    variable and the current copy (N_COPY).  N_COPIES is the total number
1897    of newly created copies.  UNROLLING is true if we are unrolling
1898    (not peeling) the loop.  */
1899
1900 static unsigned
1901 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1902 {
1903   if (unrolling)
1904     {
1905       /* If we are unrolling, initialization is done in the original loop
1906          body (number 0).  */
1907       return n_copy;
1908     }
1909   else
1910     {
1911       /* If we are peeling, the copy in that the initialization occurs has
1912          number 1.  The original loop (number 0) is the last.  */
1913       if (n_copy)
1914         return n_copy - 1;
1915       else
1916         return n_copies;
1917     }
1918 }
1919
1920 /* Locate in EXPR the expression corresponding to the location recorded
1921    in IVTS, and return a pointer to the RTX for this location.  */
1922
1923 static rtx *
1924 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1925 {
1926   unsigned i;
1927   rtx *ret = &expr;
1928
1929   for (i = 0; i < ivts->n_loc; i++)
1930     ret = &XEXP (*ret, ivts->loc[i]);
1931
1932   return ret;
1933 }
1934
1935 /* Allocate basic variable for the induction variable chain.  */
1936
1937 static void
1938 allocate_basic_variable (struct iv_to_split *ivts)
1939 {
1940   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
1941
1942   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1943 }
1944
1945 /* Insert initialization of basic variable of IVTS before INSN, taking
1946    the initial value from INSN.  */
1947
1948 static void
1949 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
1950 {
1951   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
1952   rtx seq;
1953
1954   start_sequence ();
1955   expr = force_operand (expr, ivts->base_var);
1956   if (expr != ivts->base_var)
1957     emit_move_insn (ivts->base_var, expr);
1958   seq = get_insns ();
1959   end_sequence ();
1960
1961   emit_insn_before (seq, insn);
1962 }
1963
1964 /* Replace the use of induction variable described in IVTS in INSN
1965    by base variable + DELTA * step.  */
1966
1967 static void
1968 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
1969 {
1970   rtx expr, *loc, seq, incr, var;
1971   enum machine_mode mode = GET_MODE (ivts->base_var);
1972   rtx src, dest, set;
1973
1974   /* Construct base + DELTA * step.  */
1975   if (!delta)
1976     expr = ivts->base_var;
1977   else
1978     {
1979       incr = simplify_gen_binary (MULT, mode,
1980                                   ivts->step, gen_int_mode (delta, mode));
1981       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
1982                                   ivts->base_var, incr);
1983     }
1984
1985   /* Figure out where to do the replacement.  */
1986   loc = get_ivts_expr (single_set (insn), ivts);
1987
1988   /* If we can make the replacement right away, we're done.  */
1989   if (validate_change (insn, loc, expr, 0))
1990     return;
1991
1992   /* Otherwise, force EXPR into a register and try again.  */
1993   start_sequence ();
1994   var = gen_reg_rtx (mode);
1995   expr = force_operand (expr, var);
1996   if (expr != var)
1997     emit_move_insn (var, expr);
1998   seq = get_insns ();
1999   end_sequence ();
2000   emit_insn_before (seq, insn);
2001
2002   if (validate_change (insn, loc, var, 0))
2003     return;
2004
2005   /* The last chance.  Try recreating the assignment in insn
2006      completely from scratch.  */
2007   set = single_set (insn);
2008   gcc_assert (set);
2009
2010   start_sequence ();
2011   *loc = var;
2012   src = copy_rtx (SET_SRC (set));
2013   dest = copy_rtx (SET_DEST (set));
2014   src = force_operand (src, dest);
2015   if (src != dest)
2016     emit_move_insn (dest, src);
2017   seq = get_insns ();
2018   end_sequence ();
2019
2020   emit_insn_before (seq, insn);
2021   delete_insn (insn);
2022 }
2023
2024
2025 /* Return one expansion of the accumulator recorded in struct VE.  */
2026
2027 static rtx
2028 get_expansion (struct var_to_expand *ve)
2029 {
2030   rtx reg;
2031
2032   if (ve->reuse_expansion == 0)
2033     reg = ve->reg;
2034   else
2035     reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);
2036
2037   if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
2038     ve->reuse_expansion = 0;
2039   else
2040     ve->reuse_expansion++;
2041
2042   return reg;
2043 }
2044
2045
2046 /* Given INSN replace the uses of the accumulator recorded in VE
2047    with a new register.  */
2048
2049 static void
2050 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2051 {
2052   rtx new_reg, set;
2053   bool really_new_expansion = false;
2054
2055   set = single_set (insn);
2056   gcc_assert (set);
2057
2058   /* Generate a new register only if the expansion limit has not been
2059      reached.  Else reuse an already existing expansion.  */
2060   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2061     {
2062       really_new_expansion = true;
2063       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2064     }
2065   else
2066     new_reg = get_expansion (ve);
2067
2068   validate_change (insn, &SET_DEST (set), new_reg, 1);
2069   validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);
2070
2071   if (apply_change_group ())
2072     if (really_new_expansion)
2073       {
2074         VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
2075         ve->expansion_count++;
2076       }
2077 }
2078
2079 /* Initialize the variable expansions in loop preheader.  PLACE is the
2080    loop-preheader basic block where the initialization of the
2081    expansions should take place.  The expansions are initialized with
2082    (-0) when the operation is plus or minus to honor sign zero.  This
2083    way we can prevent cases where the sign of the final result is
2084    effected by the sign of the expansion.  Here is an example to
2085    demonstrate this:
2086
2087    for (i = 0 ; i < n; i++)
2088      sum += something;
2089
2090    ==>
2091
2092    sum += something
2093    ....
2094    i = i+1;
2095    sum1 += something
2096    ....
2097    i = i+1
2098    sum2 += something;
2099    ....
2100
2101    When SUM is initialized with -zero and SOMETHING is also -zero; the
2102    final result of sum should be -zero thus the expansions sum1 and sum2
2103    should be initialized with -zero as well (otherwise we will get +zero
2104    as the final result).  */
2105
2106 static void
2107 insert_var_expansion_initialization (struct var_to_expand *ve,
2108                                      basic_block place)
2109 {
2110   rtx seq, var, zero_init, insn;
2111   unsigned i;
2112   enum machine_mode mode = GET_MODE (ve->reg);
2113   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2114
2115   if (VEC_length (rtx, ve->var_expansions) == 0)
2116     return;
2117
2118   start_sequence ();
2119   if (ve->op == PLUS || ve->op == MINUS)
2120     for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
2121       {
2122         if (honor_signed_zero_p)
2123           zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2124         else
2125           zero_init = CONST0_RTX (mode);
2126
2127         emit_move_insn (var, zero_init);
2128       }
2129   else if (ve->op == MULT)
2130     for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
2131       {
2132         zero_init =  CONST1_RTX (GET_MODE (var));
2133         emit_move_insn (var, zero_init);
2134       }
2135
2136   seq = get_insns ();
2137   end_sequence ();
2138
2139   insn = BB_HEAD (place);
2140   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2141     insn = NEXT_INSN (insn);
2142
2143   emit_insn_after (seq, insn);
2144 }
2145
2146 /* Combine the variable expansions at the loop exit.  PLACE is the
2147    loop exit basic block where the summation of the expansions should
2148    take place.  */
2149
2150 static void
2151 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2152 {
2153   rtx sum = ve->reg;
2154   rtx expr, seq, var, insn;
2155   unsigned i;
2156
2157   if (VEC_length (rtx, ve->var_expansions) == 0)
2158     return;
2159
2160   start_sequence ();
2161   if (ve->op == PLUS || ve->op == MINUS)
2162     for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
2163       {
2164         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg),
2165                                    var, sum);
2166       }
2167   else if (ve->op == MULT)
2168     for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
2169       {
2170         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg),
2171                                    var, sum);
2172       }
2173
2174   expr = force_operand (sum, ve->reg);
2175   if (expr != ve->reg)
2176     emit_move_insn (ve->reg, expr);
2177   seq = get_insns ();
2178   end_sequence ();
2179
2180   insn = BB_HEAD (place);
2181   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2182     insn = NEXT_INSN (insn);
2183
2184   emit_insn_after (seq, insn);
2185 }
2186
2187 /* Apply loop optimizations in loop copies using the
2188    data which gathered during the unrolling.  Structure
2189    OPT_INFO record that data.
2190
2191    UNROLLING is true if we unrolled (not peeled) the loop.
2192    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2193    the loop (as it should happen in complete unrolling, but not in ordinary
2194    peeling of the loop).  */
2195
2196 static void
2197 apply_opt_in_copies (struct opt_info *opt_info,
2198                      unsigned n_copies, bool unrolling,
2199                      bool rewrite_original_loop)
2200 {
2201   unsigned i, delta;
2202   basic_block bb, orig_bb;
2203   rtx insn, orig_insn, next;
2204   struct iv_to_split ivts_templ, *ivts;
2205   struct var_to_expand ve_templ, *ves;
2206
2207   /* Sanity check -- we need to put initialization in the original loop
2208      body.  */
2209   gcc_assert (!unrolling || rewrite_original_loop);
2210
2211   /* Allocate the basic variables (i0).  */
2212   if (opt_info->insns_to_split)
2213     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2214       allocate_basic_variable (ivts);
2215
2216   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2217     {
2218       bb = BASIC_BLOCK (i);
2219       orig_bb = get_bb_original (bb);
2220
2221       /* bb->aux holds position in copy sequence initialized by
2222          duplicate_loop_to_header_edge.  */
2223       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2224                                         unrolling);
2225       bb->aux = 0;
2226       orig_insn = BB_HEAD (orig_bb);
2227       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2228         {
2229           next = NEXT_INSN (insn);
2230           if (!INSN_P (insn))
2231             continue;
2232
2233           while (!INSN_P (orig_insn))
2234             orig_insn = NEXT_INSN (orig_insn);
2235
2236           ivts_templ.insn = orig_insn;
2237           ve_templ.insn = orig_insn;
2238
2239           /* Apply splitting iv optimization.  */
2240           if (opt_info->insns_to_split)
2241             {
2242               ivts = (struct iv_to_split *)
2243                 htab_find (opt_info->insns_to_split, &ivts_templ);
2244
2245               if (ivts)
2246                 {
2247                   gcc_assert (GET_CODE (PATTERN (insn))
2248                               == GET_CODE (PATTERN (orig_insn)));
2249
2250                   if (!delta)
2251                     insert_base_initialization (ivts, insn);
2252                   split_iv (ivts, insn, delta);
2253                 }
2254             }
2255           /* Apply variable expansion optimization.  */
2256           if (unrolling && opt_info->insns_with_var_to_expand)
2257             {
2258               ves = (struct var_to_expand *)
2259                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2260               if (ves)
2261                 {
2262                   gcc_assert (GET_CODE (PATTERN (insn))
2263                               == GET_CODE (PATTERN (orig_insn)));
2264                   expand_var_during_unrolling (ves, insn);
2265                 }
2266             }
2267           orig_insn = NEXT_INSN (orig_insn);
2268         }
2269     }
2270
2271   if (!rewrite_original_loop)
2272     return;
2273
2274   /* Initialize the variable expansions in the loop preheader
2275      and take care of combining them at the loop exit.  */
2276   if (opt_info->insns_with_var_to_expand)
2277     {
2278       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2279         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2280       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2281         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2282     }
2283
2284   /* Rewrite also the original loop body.  Find them as originals of the blocks
2285      in the last copied iteration, i.e. those that have
2286      get_bb_copy (get_bb_original (bb)) == bb.  */
2287   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2288     {
2289       bb = BASIC_BLOCK (i);
2290       orig_bb = get_bb_original (bb);
2291       if (get_bb_copy (orig_bb) != bb)
2292         continue;
2293
2294       delta = determine_split_iv_delta (0, n_copies, unrolling);
2295       for (orig_insn = BB_HEAD (orig_bb);
2296            orig_insn != NEXT_INSN (BB_END (bb));
2297            orig_insn = next)
2298         {
2299           next = NEXT_INSN (orig_insn);
2300
2301           if (!INSN_P (orig_insn))
2302             continue;
2303
2304           ivts_templ.insn = orig_insn;
2305           if (opt_info->insns_to_split)
2306             {
2307               ivts = (struct iv_to_split *)
2308                 htab_find (opt_info->insns_to_split, &ivts_templ);
2309               if (ivts)
2310                 {
2311                   if (!delta)
2312                     insert_base_initialization (ivts, orig_insn);
2313                   split_iv (ivts, orig_insn, delta);
2314                   continue;
2315                 }
2316             }
2317
2318         }
2319     }
2320 }
2321
2322 /* Release OPT_INFO.  */
2323
2324 static void
2325 free_opt_info (struct opt_info *opt_info)
2326 {
2327   if (opt_info->insns_to_split)
2328     htab_delete (opt_info->insns_to_split);
2329   if (opt_info->insns_with_var_to_expand)
2330     {
2331       struct var_to_expand *ves;
2332
2333       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2334         VEC_free (rtx, heap, ves->var_expansions);
2335       htab_delete (opt_info->insns_with_var_to_expand);
2336     }
2337   free (opt_info);
2338 }