gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "rtl.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "cfglayout.h"
  31 #include "params.h"
  32 #include "output.h"
  33 #include "expr.h"
  34 #include "hashtab.h"
  35 #include "recog.h"
  36 #include "target.h"
  37
  38 /* This pass performs loop unrolling and peeling.  We only perform these
  39    optimizations on innermost loops (with single exception) because
  40    the impact on performance is greatest here, and we want to avoid
  41    unnecessary code size growth.  The gain is caused by greater sequentiality
  42    of code, better code to optimize for further passes and in some cases
  43    by fewer testings of exit conditions.  The main problem is code growth,
  44    that impacts performance negatively due to effect of caches.
  45
  46    What we do:
  47
  48    -- complete peeling of once-rolling loops; this is the above mentioned
  49       exception, as this causes loop to be cancelled completely and
  50       does not cause code growth
  51    -- complete peeling of loops that roll (small) constant times.
  52    -- simple peeling of first iterations of loops that do not roll much
  53       (according to profile feedback)
  54    -- unrolling of loops that roll constant times; this is almost always
  55       win, as we get rid of exit condition tests.
  56    -- unrolling of loops that roll number of times that we can compute
  57       in runtime; we also get rid of exit condition tests here, but there
  58       is the extra expense for calculating the number of iterations
  59    -- simple unrolling of remaining loops; this is performed only if we
  60       are asked to, as the gain is questionable in this case and often
  61       it may even slow down the code
  62    For more detailed descriptions of each of those, see comments at
  63    appropriate function below.
  64
   There are many parameters (defined and described in params.def) that
   control how much we unroll/peel.
  67
   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of several
   percent.  */
  72
/* Information about induction variables to split.  Each record describes
   one occurrence of an induction variable inside an insn; the records are
   chained through NEXT.  */

struct iv_to_split
{
  rtx insn;             /* The insn in which the induction variable occurs.  */
  rtx base_var;         /* The variable on which the values in the further
                           iterations are based.  */
  rtx step;             /* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
  unsigned n_loc;       /* Number of entries of LOC that are in use (see the
                           example below).  */
  unsigned loc[3];      /* Location where the definition of the induction
                           variable occurs in the insn.  For example if
                           N_LOC is 2, the expression is located at
                           XEXP (XEXP (single_set, loc[0]), loc[1]).  */
};
  88
/* Information about accumulators to expand.  Each record describes one
   accumulator register together with the copies created for it; the
   records are chained through NEXT.  */

struct var_to_expand
{
  rtx insn;                        /* The insn in which the variable expansion occurs.  */
  rtx reg;                         /* The accumulator which is expanded.  */
  VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;      /* Next entry in walking order.  */
  enum rtx_code op;                /* The type of the accumulation - addition, subtraction
                                      or multiplication.  */
  int expansion_count;             /* Count of the expansions generated so far.  */
  int reuse_expansion;             /* The expansion we intend to reuse to expand
                                      the accumulator.  If REUSE_EXPANSION is 0 reuse
                                      the original accumulator.  Else use
                                      var_expansions[REUSE_EXPANSION - 1].  */
  unsigned accum_pos;              /* The position in which the accumulator is placed in
                                      the insn src.  For example in x = x + something
                                      accum_pos is 0 while in x = something + x accum_pos
                                      is 1.  */
};
 109
/* Information about the optimizations applied in the unrolled loop.  It
   holds the set of induction variables to split and the set of
   accumulators to expand; each set is kept both as a hash table and as a
   linked list with a head pointer and a pointer to its tail.  */

struct opt_info
{
  htab_t insns_to_split;           /* A hashtable of insns to split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
                                      to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;        /* The first basic block that was
                                      duplicated.  */
  basic_block loop_exit;           /* The loop exit basic block.  */
  basic_block loop_preheader;      /* The loop preheader basic block.  */
};
 127
/* Forward declarations of the file-local functions.  */

/* Drivers that walk over all loops.  */
static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);

/* Per-loop decision functions; each takes the loop and the pass flags.  */
static void decide_peel_simple (struct loop *, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);

/* Per-loop transformation functions.  */
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);

/* Helpers working on struct opt_info, struct iv_to_split and
   struct var_to_expand.  */
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx);
static void expand_var_during_unrolling (struct var_to_expand *, rtx);
static void insert_var_expansion_initialization (struct var_to_expand *,
                                                 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
                                             basic_block);
static rtx get_expansion (struct var_to_expand *);
 154
 155 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 156 void
 157 unroll_and_peel_loops (int flags)
 158 {
 159   struct loop *loop;
 160   bool check;
 161   loop_iterator li;
 162
 163   /* First perform complete loop peeling (it is almost surely a win,
 164      and affects parameters for further decision a lot).  */
 165   peel_loops_completely (flags);
 166
 167   /* Now decide rest of unrolling and peeling.  */
 168   decide_unrolling_and_peeling (flags);
 169
 170   /* Scan the loops, inner ones first.  */
 171   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 172     {
 173       check = true;
 174       /* And perform the appropriate transformations.  */
 175       switch (loop->lpt_decision.decision)
 176         {
 177         case LPT_PEEL_COMPLETELY:
 178           /* Already done.  */
 179           gcc_unreachable ();
 180         case LPT_PEEL_SIMPLE:
 181           peel_loop_simple (loop);
 182           break;
 183         case LPT_UNROLL_CONSTANT:
 184           unroll_loop_constant_iterations (loop);
 185           break;
 186         case LPT_UNROLL_RUNTIME:
 187           unroll_loop_runtime_iterations (loop);
 188           break;
 189         case LPT_UNROLL_STUPID:
 190           unroll_loop_stupid (loop);
 191           break;
 192         case LPT_NONE:
 193           check = false;
 194           break;
 195         default:
 196           gcc_unreachable ();
 197         }
 198       if (check)
 199         {
 200 #ifdef ENABLE_CHECKING
 201           verify_loop_structure ();
 202 #endif
 203         }
 204     }
 205
 206   iv_analysis_done ();
 207 }
 208
 209 /* Check whether exit of the LOOP is at the end of loop body.  */
 210
 211 static bool
 212 loop_exit_at_end_p (struct loop *loop)
 213 {
 214   struct niter_desc *desc = get_simple_loop_desc (loop);
 215   rtx insn;
 216
 217   if (desc->in_edge->dest != loop->latch)
 218     return false;
 219
 220   /* Check that the latch is empty.  */
 221   FOR_BB_INSNS (loop->latch, insn)
 222     {
 223       if (INSN_P (insn))
 224         return false;
 225     }
 226
 227   return true;
 228 }
 229
 230 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 231 static void
 232 peel_loops_completely (int flags)
 233 {
 234   struct loop *loop;
 235   loop_iterator li;
 236
 237   /* Scan the loops, the inner ones first.  */
 238   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 239     {
 240       loop->lpt_decision.decision = LPT_NONE;
 241
 242       if (dump_file)
 243         fprintf (dump_file,
 244                  "\n;; *** Considering loop %d for complete peeling ***\n",
 245                  loop->num);
 246
 247       loop->ninsns = num_loop_insns (loop);
 248
 249       decide_peel_once_rolling (loop, flags);
 250       if (loop->lpt_decision.decision == LPT_NONE)
 251         decide_peel_completely (loop, flags);
 252
 253       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 254         {
 255           peel_loop_completely (loop);
 256 #ifdef ENABLE_CHECKING
 257           verify_loop_structure ();
 258 #endif
 259         }
 260     }
 261 }
 262
 263 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 264 static void
 265 decide_unrolling_and_peeling (int flags)
 266 {
 267   struct loop *loop;
 268   loop_iterator li;
 269
 270   /* Scan the loops, inner ones first.  */
 271   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 272     {
 273       loop->lpt_decision.decision = LPT_NONE;
 274
 275       if (dump_file)
 276         fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 277
 278       /* Do not peel cold areas.  */
 279       if (optimize_loop_for_size_p (loop))
 280         {
 281           if (dump_file)
 282             fprintf (dump_file, ";; Not considering loop, cold area\n");
 283           continue;
 284         }
 285
 286       /* Can the loop be manipulated?  */
 287       if (!can_duplicate_loop_p (loop))
 288         {
 289           if (dump_file)
 290             fprintf (dump_file,
 291                      ";; Not considering loop, cannot duplicate\n");
 292           continue;
 293         }
 294
 295       /* Skip non-innermost loops.  */
 296       if (loop->inner)
 297         {
 298           if (dump_file)
 299             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 300           continue;
 301         }
 302
 303       loop->ninsns = num_loop_insns (loop);
 304       loop->av_ninsns = average_num_loop_insns (loop);
 305
 306       /* Try transformations one by one in decreasing order of
 307          priority.  */
 308
 309       decide_unroll_constant_iterations (loop, flags);
 310       if (loop->lpt_decision.decision == LPT_NONE)
 311         decide_unroll_runtime_iterations (loop, flags);
 312       if (loop->lpt_decision.decision == LPT_NONE)
 313         decide_unroll_stupid (loop, flags);
 314       if (loop->lpt_decision.decision == LPT_NONE)
 315         decide_peel_simple (loop, flags);
 316     }
 317 }
 318
 319 /* Decide whether the LOOP is once rolling and suitable for complete
 320    peeling.  */
 321 static void
 322 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 323 {
 324   struct niter_desc *desc;
 325
 326   if (dump_file)
 327     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 328
 329   /* Is the loop small enough?  */
 330   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 331     {
 332       if (dump_file)
 333         fprintf (dump_file, ";; Not considering loop, is too big\n");
 334       return;
 335     }
 336
 337   /* Check for simple loops.  */
 338   desc = get_simple_loop_desc (loop);
 339
 340   /* Check number of iterations.  */
 341   if (!desc->simple_p
 342       || desc->assumptions
 343       || desc->infinite
 344       || !desc->const_iter
 345       || desc->niter != 0)
 346     {
 347       if (dump_file)
 348         fprintf (dump_file,
 349                  ";; Unable to prove that the loop rolls exactly once\n");
 350       return;
 351     }
 352
 353   /* Success.  */
 354   if (dump_file)
 355     fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
 356   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 357 }
 358
 359 /* Decide whether the LOOP is suitable for complete peeling.  */
 360 static void
 361 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 362 {
 363   unsigned npeel;
 364   struct niter_desc *desc;
 365
 366   if (dump_file)
 367     fprintf (dump_file, "\n;; Considering peeling completely\n");
 368
 369   /* Skip non-innermost loops.  */
 370   if (loop->inner)
 371     {
 372       if (dump_file)
 373         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 374       return;
 375     }
 376
 377   /* Do not peel cold areas.  */
 378   if (optimize_loop_for_size_p (loop))
 379     {
 380       if (dump_file)
 381         fprintf (dump_file, ";; Not considering loop, cold area\n");
 382       return;
 383     }
 384
 385   /* Can the loop be manipulated?  */
 386   if (!can_duplicate_loop_p (loop))
 387     {
 388       if (dump_file)
 389         fprintf (dump_file,
 390                  ";; Not considering loop, cannot duplicate\n");
 391       return;
 392     }
 393
 394   /* npeel = number of iterations to peel.  */
 395   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 396   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 397     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 398
 399   /* Is the loop small enough?  */
 400   if (!npeel)
 401     {
 402       if (dump_file)
 403         fprintf (dump_file, ";; Not considering loop, is too big\n");
 404       return;
 405     }
 406
 407   /* Check for simple loops.  */
 408   desc = get_simple_loop_desc (loop);
 409
 410   /* Check number of iterations.  */
 411   if (!desc->simple_p
 412       || desc->assumptions
 413       || !desc->const_iter
 414       || desc->infinite)
 415     {
 416       if (dump_file)
 417         fprintf (dump_file,
 418                  ";; Unable to prove that the loop iterates constant times\n");
 419       return;
 420     }
 421
 422   if (desc->niter > npeel - 1)
 423     {
 424       if (dump_file)
 425         {
 426           fprintf (dump_file,
 427                    ";; Not peeling loop completely, rolls too much (");
 428           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 429           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 430         }
 431       return;
 432     }
 433
 434   /* Success.  */
 435   if (dump_file)
 436     fprintf (dump_file, ";; Decided to peel loop completely\n");
 437   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 438 }
 439
 440 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 441    completely.  The transformation done:
 442
 443    for (i = 0; i < 4; i++)
 444      body;
 445
 446    ==>
 447
 448    i = 0;
 449    body; i++;
 450    body; i++;
 451    body; i++;
 452    body; i++;
 453    */
 454 static void
 455 peel_loop_completely (struct loop *loop)
 456 {
 457   sbitmap wont_exit;
 458   unsigned HOST_WIDE_INT npeel;
 459   unsigned i;
 460   VEC (edge, heap) *remove_edges;
 461   edge ein;
 462   struct niter_desc *desc = get_simple_loop_desc (loop);
 463   struct opt_info *opt_info = NULL;
 464
 465   npeel = desc->niter;
 466
 467   if (npeel)
 468     {
 469       bool ok;
 470
 471       wont_exit = sbitmap_alloc (npeel + 1);
 472       sbitmap_ones (wont_exit);
 473       RESET_BIT (wont_exit, 0);
 474       if (desc->noloop_assumptions)
 475         RESET_BIT (wont_exit, 1);
 476
 477       remove_edges = NULL;
 478
 479       if (flag_split_ivs_in_unroller)
 480         opt_info = analyze_insns_in_loop (loop);
 481
 482       opt_info_start_duplication (opt_info);
 483       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 484                                           npeel,
 485                                           wont_exit, desc->out_edge,
 486                                           &remove_edges,
 487                                           DLTHE_FLAG_UPDATE_FREQ
 488                                           | DLTHE_FLAG_COMPLETTE_PEEL
 489                                           | (opt_info
 490                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 491       gcc_assert (ok);
 492
 493       free (wont_exit);
 494
 495       if (opt_info)
 496         {
 497           apply_opt_in_copies (opt_info, npeel, false, true);
 498           free_opt_info (opt_info);
 499         }
 500
 501       /* Remove the exit edges.  */
 502       FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
 503         remove_path (ein);
 504       VEC_free (edge, heap, remove_edges);
 505     }
 506
 507   ein = desc->in_edge;
 508   free_simple_loop_desc (loop);
 509
 510   /* Now remove the unreachable part of the last iteration and cancel
 511      the loop.  */
 512   remove_path (ein);
 513
 514   if (dump_file)
 515     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 516 }
 517
 518 /* Decide whether to unroll LOOP iterating constant number of times
 519    and how much.  */
 520
 521 static void
 522 decide_unroll_constant_iterations (struct loop *loop, int flags)
 523 {
 524   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 525   struct niter_desc *desc;
 526
 527   if (!(flags & UAP_UNROLL))
 528     {
 529       /* We were not asked to, just return back silently.  */
 530       return;
 531     }
 532
 533   if (dump_file)
 534     fprintf (dump_file,
 535              "\n;; Considering unrolling loop with constant "
 536              "number of iterations\n");
 537
 538   /* nunroll = total number of copies of the original loop body in
 539      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 540   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 541   nunroll_by_av
 542     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 543   if (nunroll > nunroll_by_av)
 544     nunroll = nunroll_by_av;
 545   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 546     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 547
 548   /* Skip big loops.  */
 549   if (nunroll <= 1)
 550     {
 551       if (dump_file)
 552         fprintf (dump_file, ";; Not considering loop, is too big\n");
 553       return;
 554     }
 555
 556   /* Check for simple loops.  */
 557   desc = get_simple_loop_desc (loop);
 558
 559   /* Check number of iterations.  */
 560   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 561     {
 562       if (dump_file)
 563         fprintf (dump_file,
 564                  ";; Unable to prove that the loop iterates constant times\n");
 565       return;
 566     }
 567
 568   /* Check whether the loop rolls enough to consider.  */
 569   if (desc->niter < 2 * nunroll)
 570     {
 571       if (dump_file)
 572         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 573       return;
 574     }
 575
 576   /* Success; now compute number of iterations to unroll.  We alter
 577      nunroll so that as few as possible copies of loop body are
 578      necessary, while still not decreasing the number of unrollings
 579      too much (at most by 1).  */
 580   best_copies = 2 * nunroll + 10;
 581
 582   i = 2 * nunroll + 2;
 583   if (i - 1 >= desc->niter)
 584     i = desc->niter - 2;
 585
 586   for (; i >= nunroll - 1; i--)
 587     {
 588       unsigned exit_mod = desc->niter % (i + 1);
 589
 590       if (!loop_exit_at_end_p (loop))
 591         n_copies = exit_mod + i + 1;
 592       else if (exit_mod != (unsigned) i
 593                || desc->noloop_assumptions != NULL_RTX)
 594         n_copies = exit_mod + i + 2;
 595       else
 596         n_copies = i + 1;
 597
 598       if (n_copies < best_copies)
 599         {
 600           best_copies = n_copies;
 601           best_unroll = i;
 602         }
 603     }
 604
 605   if (dump_file)
 606     fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
 607              best_unroll + 1, best_copies, nunroll);
 608
 609   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 610   loop->lpt_decision.times = best_unroll;
 611
 612   if (dump_file)
 613     fprintf (dump_file,
 614              ";; Decided to unroll the constant times rolling loop, %d times.\n",
 615              loop->lpt_decision.times);
 616 }
 617
 618 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
 619    times.  The transformation does this:
 620
 621    for (i = 0; i < 102; i++)
 622      body;
 623
 624    ==>
 625
 626    i = 0;
 627    body; i++;
 628    body; i++;
 629    while (i < 102)
 630      {
 631        body; i++;
 632        body; i++;
 633        body; i++;
 634        body; i++;
 635      }
 636   */
 637 static void
 638 unroll_loop_constant_iterations (struct loop *loop)
 639 {
 640   unsigned HOST_WIDE_INT niter;
 641   unsigned exit_mod;
 642   sbitmap wont_exit;
 643   unsigned i;
 644   VEC (edge, heap) *remove_edges;
 645   edge e;
 646   unsigned max_unroll = loop->lpt_decision.times;
 647   struct niter_desc *desc = get_simple_loop_desc (loop);
 648   bool exit_at_end = loop_exit_at_end_p (loop);
 649   struct opt_info *opt_info = NULL;
 650   bool ok;
 651
 652   niter = desc->niter;
 653
 654   /* Should not get here (such loop should be peeled instead).  */
 655   gcc_assert (niter > max_unroll + 1);
 656
 657   exit_mod = niter % (max_unroll + 1);
 658
 659   wont_exit = sbitmap_alloc (max_unroll + 1);
 660   sbitmap_ones (wont_exit);
 661
 662   remove_edges = NULL;
 663   if (flag_split_ivs_in_unroller
 664       || flag_variable_expansion_in_unroller)
 665     opt_info = analyze_insns_in_loop (loop);
 666
 667   if (!exit_at_end)
 668     {
 669       /* The exit is not at the end of the loop; leave exit test
 670          in the first copy, so that the loops that start with test
 671          of exit condition have continuous body after unrolling.  */
 672
 673       if (dump_file)
 674         fprintf (dump_file, ";; Condition on beginning of loop.\n");
 675
 676       /* Peel exit_mod iterations.  */
 677       RESET_BIT (wont_exit, 0);
 678       if (desc->noloop_assumptions)
 679         RESET_BIT (wont_exit, 1);
 680
 681       if (exit_mod)
 682         {
 683           opt_info_start_duplication (opt_info);
 684           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 685                                               exit_mod,
 686                                               wont_exit, desc->out_edge,
 687                                               &remove_edges,
 688                                               DLTHE_FLAG_UPDATE_FREQ
 689                                               | (opt_info && exit_mod > 1
 690                                                  ? DLTHE_RECORD_COPY_NUMBER
 691                                                    : 0));
 692           gcc_assert (ok);
 693
 694           if (opt_info && exit_mod > 1)
 695             apply_opt_in_copies (opt_info, exit_mod, false, false);
 696
 697           desc->noloop_assumptions = NULL_RTX;
 698           desc->niter -= exit_mod;
 699           desc->niter_max -= exit_mod;
 700         }
 701
 702       SET_BIT (wont_exit, 1);
 703     }
 704   else
 705     {
 706       /* Leave exit test in last copy, for the same reason as above if
 707          the loop tests the condition at the end of loop body.  */
 708
 709       if (dump_file)
 710         fprintf (dump_file, ";; Condition on end of loop.\n");
 711
 712       /* We know that niter >= max_unroll + 2; so we do not need to care of
 713          case when we would exit before reaching the loop.  So just peel
 714          exit_mod + 1 iterations.  */
 715       if (exit_mod != max_unroll
 716           || desc->noloop_assumptions)
 717         {
 718           RESET_BIT (wont_exit, 0);
 719           if (desc->noloop_assumptions)
 720             RESET_BIT (wont_exit, 1);
 721
 722           opt_info_start_duplication (opt_info);
 723           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 724                                               exit_mod + 1,
 725                                               wont_exit, desc->out_edge,
 726                                               &remove_edges,
 727                                               DLTHE_FLAG_UPDATE_FREQ
 728                                               | (opt_info && exit_mod > 0
 729                                                  ? DLTHE_RECORD_COPY_NUMBER
 730                                                    : 0));
 731           gcc_assert (ok);
 732
 733           if (opt_info && exit_mod > 0)
 734             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 735
 736           desc->niter -= exit_mod + 1;
 737           desc->niter_max -= exit_mod + 1;
 738           desc->noloop_assumptions = NULL_RTX;
 739
 740           SET_BIT (wont_exit, 0);
 741           SET_BIT (wont_exit, 1);
 742         }
 743
 744       RESET_BIT (wont_exit, max_unroll);
 745     }
 746
 747   /* Now unroll the loop.  */
 748
 749   opt_info_start_duplication (opt_info);
 750   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 751                                       max_unroll,
 752                                       wont_exit, desc->out_edge,
 753                                       &remove_edges,
 754                                       DLTHE_FLAG_UPDATE_FREQ
 755                                       | (opt_info
 756                                          ? DLTHE_RECORD_COPY_NUMBER
 757                                            : 0));
 758   gcc_assert (ok);
 759
 760   if (opt_info)
 761     {
 762       apply_opt_in_copies (opt_info, max_unroll, true, true);
 763       free_opt_info (opt_info);
 764     }
 765
 766   free (wont_exit);
 767
 768   if (exit_at_end)
 769     {
 770       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 771       /* Find a new in and out edge; they are in the last copy we have made.  */
 772
 773       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 774         {
 775           desc->out_edge = EDGE_SUCC (exit_block, 0);
 776           desc->in_edge = EDGE_SUCC (exit_block, 1);
 777         }
 778       else
 779         {
 780           desc->out_edge = EDGE_SUCC (exit_block, 1);
 781           desc->in_edge = EDGE_SUCC (exit_block, 0);
 782         }
 783     }
 784
 785   desc->niter /= max_unroll + 1;
 786   desc->niter_max /= max_unroll + 1;
 787   desc->niter_expr = GEN_INT (desc->niter);
 788
 789   /* Remove the edges.  */
 790   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
 791     remove_path (e);
 792   VEC_free (edge, heap, remove_edges);
 793
 794   if (dump_file)
 795     fprintf (dump_file,
 796              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 797              max_unroll, num_loop_insns (loop));
 798 }
 799
 800 /* Decide whether to unroll LOOP iterating runtime computable number of times
 801    and how much.  */
 802 static void
 803 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 804 {
 805   unsigned nunroll, nunroll_by_av, i;
 806   struct niter_desc *desc;
 807
 808   if (!(flags & UAP_UNROLL))
 809     {
 810       /* We were not asked to, just return back silently.  */
 811       return;
 812     }
 813
 814   if (dump_file)
 815     fprintf (dump_file,
 816              "\n;; Considering unrolling loop with runtime "
 817              "computable number of iterations\n");
 818
 819   /* nunroll = total number of copies of the original loop body in
 820      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 821   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 822   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 823   if (nunroll > nunroll_by_av)
 824     nunroll = nunroll_by_av;
 825   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 826     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 827
 828   if (targetm.loop_unroll_adjust)
 829     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 830
 831   /* Skip big loops.  */
 832   if (nunroll <= 1)
 833     {
 834       if (dump_file)
 835         fprintf (dump_file, ";; Not considering loop, is too big\n");
 836       return;
 837     }
 838
 839   /* Check for simple loops.  */
 840   desc = get_simple_loop_desc (loop);
 841
 842   /* Check simpleness.  */
 843   if (!desc->simple_p || desc->assumptions)
 844     {
 845       if (dump_file)
 846         fprintf (dump_file,
 847                  ";; Unable to prove that the number of iterations "
 848                  "can be counted in runtime\n");
 849       return;
 850     }
 851
 852   if (desc->const_iter)
 853     {
 854       if (dump_file)
 855         fprintf (dump_file, ";; Loop iterates constant times\n");
 856       return;
 857     }
 858
 859   /* If we have profile feedback, check whether the loop rolls.  */
 860   if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
 861     {
 862       if (dump_file)
 863         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 864       return;
 865     }
 866
 867   /* Success; now force nunroll to be power of 2, as we are unable to
 868      cope with overflows in computation of number of iterations.  */
 869   for (i = 1; 2 * i <= nunroll; i *= 2)
 870     continue;
 871
 872   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 873   loop->lpt_decision.times = i - 1;
 874
 875   if (dump_file)
 876     fprintf (dump_file,
 877              ";; Decided to unroll the runtime computable "
 878              "times rolling loop, %d times.\n",
 879              loop->lpt_decision.times);
 880 }
 881
 882 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
 883    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
 884    and NULL is returned instead.  */
 885
 886 basic_block
 887 split_edge_and_insert (edge e, rtx insns)
 888 {
 889   basic_block bb;
 890
 891   if (!insns)
 892     return NULL;
 893   bb = split_edge (e);
 894   emit_insn_after (insns, BB_END (bb));
 895
 896   /* ??? We used to assume that INSNS can contain control flow insns, and
 897      that we had to try to find sub basic blocks in BB to maintain a valid
 898      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
 899      and call break_superblocks when going out of cfglayout mode.  But it
 900      turns out that this never happens; and that if it does ever happen,
 901      the TODO_verify_flow at the end of the RTL loop passes would fail.
 902
 903      There are two reasons why we expected we could have control flow insns
 904      in INSNS.  The first is when a comparison has to be done in parts, and
 905      the second is when the number of iterations is computed for loops with
 906      the number of iterations known at runtime.  In both cases, test cases
 907      to get control flow in INSNS appear to be impossible to construct:
 908
 909       * If do_compare_rtx_and_jump needs several branches to do comparison
 910         in a mode that needs comparison by parts, we cannot analyze the
 911         number of iterations of the loop, and we never get to unrolling it.
 912
 913       * The code in expand_divmod that was suspected to cause creation of
 914         branching code seems to be only accessed for signed division.  The
 915         divisions used by # of iterations analysis are always unsigned.
 916         Problems might arise on architectures that emits branching code
 917         for some operations that may appear in the unroller (especially
 918         for division), but we have no such architectures.
 919
 920      Considering all this, it was decided that we should for now assume
 921      that INSNS can in theory contain control flow insns, but in practice
 922      it never does.  So we don't handle the theoretical case, and should
 923      a real failure ever show up, we have a pretty good clue for how to
 924      fix it.  */
 925
 926   return bb;
 927 }
 928
 929 /* Unroll LOOP for that we are able to count number of iterations in runtime
 930    LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this (with some
 931    extra care for case n < 0):
 932
 933    for (i = 0; i < n; i++)
 934      body;
 935
 936    ==>
 937
 938    i = 0;
 939    mod = n % 4;
 940
 941    switch (mod)
 942      {
 943        case 3:
 944          body; i++;
 945        case 2:
 946          body; i++;
 947        case 1:
 948          body; i++;
 949        case 0: ;
 950      }
 951
 952    while (i < n)
 953      {
 954        body; i++;
 955        body; i++;
 956        body; i++;
 957        body; i++;
 958      }
 959    */
 960 static void
 961 unroll_loop_runtime_iterations (struct loop *loop)
 962 {
 963   rtx old_niter, niter, init_code, branch_code, tmp;
 964   unsigned i, j, p;
 965   basic_block preheader, *body, swtch, ezc_swtch;
 966   VEC (basic_block, heap) *dom_bbs;
 967   sbitmap wont_exit;
 968   int may_exit_copy;
 969   unsigned n_peel;
 970   VEC (edge, heap) *remove_edges;
 971   edge e;
 972   bool extra_zero_check, last_may_exit;
 973   unsigned max_unroll = loop->lpt_decision.times;
 974   struct niter_desc *desc = get_simple_loop_desc (loop);
 975   bool exit_at_end = loop_exit_at_end_p (loop);
 976   struct opt_info *opt_info = NULL;
 977   bool ok;
 978
 979   if (flag_split_ivs_in_unroller
 980       || flag_variable_expansion_in_unroller)
 981     opt_info = analyze_insns_in_loop (loop);
 982
 983   /* Remember blocks whose dominators will have to be updated.  */
 984   dom_bbs = NULL;
 985
 986   body = get_loop_body (loop);
 987   for (i = 0; i < loop->num_nodes; i++)
 988     {
 989       VEC (basic_block, heap) *ldom;
 990       basic_block bb;
 991
 992       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
 993       FOR_EACH_VEC_ELT (basic_block, ldom, j, bb)
 994         if (!flow_bb_inside_loop_p (loop, bb))
 995           VEC_safe_push (basic_block, heap, dom_bbs, bb);
 996
 997       VEC_free (basic_block, heap, ldom);
 998     }
 999   free (body);
1000
1001   if (!exit_at_end)
1002     {
1003       /* Leave exit in first copy (for explanation why see comment in
1004          unroll_loop_constant_iterations).  */
1005       may_exit_copy = 0;
1006       n_peel = max_unroll - 1;
1007       extra_zero_check = true;
1008       last_may_exit = false;
1009     }
1010   else
1011     {
1012       /* Leave exit in last copy (for explanation why see comment in
1013          unroll_loop_constant_iterations).  */
1014       may_exit_copy = max_unroll;
1015       n_peel = max_unroll;
1016       extra_zero_check = false;
1017       last_may_exit = true;
1018     }
1019
1020   /* Get expression for number of iterations.  */
1021   start_sequence ();
1022   old_niter = niter = gen_reg_rtx (desc->mode);
1023   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1024   if (tmp != niter)
1025     emit_move_insn (niter, tmp);
1026
1027   /* Count modulo by ANDing it with max_unroll; we use the fact that
1028      the number of unrollings is a power of two, and thus this is correct
1029      even if there is overflow in the computation.  */
1030   niter = expand_simple_binop (desc->mode, AND,
1031                                niter,
1032                                GEN_INT (max_unroll),
1033                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1034
1035   init_code = get_insns ();
1036   end_sequence ();
1037   unshare_all_rtl_in_chain (init_code);
1038
1039   /* Precondition the loop.  */
1040   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1041
1042   remove_edges = NULL;
1043
1044   wont_exit = sbitmap_alloc (max_unroll + 2);
1045
1046   /* Peel the first copy of loop body (almost always we must leave exit test
1047      here; the only exception is when we have extra zero check and the number
1048      of iterations is reliable.  Also record the place of (possible) extra
1049      zero check.  */
1050   sbitmap_zero (wont_exit);
1051   if (extra_zero_check
1052       && !desc->noloop_assumptions)
1053     SET_BIT (wont_exit, 1);
1054   ezc_swtch = loop_preheader_edge (loop)->src;
1055   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1056                                       1, wont_exit, desc->out_edge,
1057                                       &remove_edges,
1058                                       DLTHE_FLAG_UPDATE_FREQ);
1059   gcc_assert (ok);
1060
1061   /* Record the place where switch will be built for preconditioning.  */
1062   swtch = split_edge (loop_preheader_edge (loop));
1063
1064   for (i = 0; i < n_peel; i++)
1065     {
1066       /* Peel the copy.  */
1067       sbitmap_zero (wont_exit);
1068       if (i != n_peel - 1 || !last_may_exit)
1069         SET_BIT (wont_exit, 1);
1070       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1071                                           1, wont_exit, desc->out_edge,
1072                                           &remove_edges,
1073                                           DLTHE_FLAG_UPDATE_FREQ);
1074       gcc_assert (ok);
1075
1076       /* Create item for switch.  */
1077       j = n_peel - i - (extra_zero_check ? 0 : 1);
1078       p = REG_BR_PROB_BASE / (i + 2);
1079
1080       preheader = split_edge (loop_preheader_edge (loop));
1081       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1082                                           block_label (preheader), p,
1083                                           NULL_RTX);
1084
1085       /* We rely on the fact that the compare and jump cannot be optimized out,
1086          and hence the cfg we create is correct.  */
1087       gcc_assert (branch_code != NULL_RTX);
1088
1089       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1090       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1091       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1092       e = make_edge (swtch, preheader,
1093                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1094       e->probability = p;
1095     }
1096
1097   if (extra_zero_check)
1098     {
1099       /* Add branch for zero iterations.  */
1100       p = REG_BR_PROB_BASE / (max_unroll + 1);
1101       swtch = ezc_swtch;
1102       preheader = split_edge (loop_preheader_edge (loop));
1103       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1104                                           block_label (preheader), p,
1105                                           NULL_RTX);
1106       gcc_assert (branch_code != NULL_RTX);
1107
1108       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1109       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1110       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1111       e = make_edge (swtch, preheader,
1112                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1113       e->probability = p;
1114     }
1115
1116   /* Recount dominators for outer blocks.  */
1117   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1118
1119   /* And unroll loop.  */
1120
1121   sbitmap_ones (wont_exit);
1122   RESET_BIT (wont_exit, may_exit_copy);
1123   opt_info_start_duplication (opt_info);
1124
1125   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1126                                       max_unroll,
1127                                       wont_exit, desc->out_edge,
1128                                       &remove_edges,
1129                                       DLTHE_FLAG_UPDATE_FREQ
1130                                       | (opt_info
1131                                          ? DLTHE_RECORD_COPY_NUMBER
1132                                            : 0));
1133   gcc_assert (ok);
1134
1135   if (opt_info)
1136     {
1137       apply_opt_in_copies (opt_info, max_unroll, true, true);
1138       free_opt_info (opt_info);
1139     }
1140
1141   free (wont_exit);
1142
1143   if (exit_at_end)
1144     {
1145       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1146       /* Find a new in and out edge; they are in the last copy we have
1147          made.  */
1148
1149       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1150         {
1151           desc->out_edge = EDGE_SUCC (exit_block, 0);
1152           desc->in_edge = EDGE_SUCC (exit_block, 1);
1153         }
1154       else
1155         {
1156           desc->out_edge = EDGE_SUCC (exit_block, 1);
1157           desc->in_edge = EDGE_SUCC (exit_block, 0);
1158         }
1159     }
1160
1161   /* Remove the edges.  */
1162   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
1163     remove_path (e);
1164   VEC_free (edge, heap, remove_edges);
1165
1166   /* We must be careful when updating the number of iterations due to
1167      preconditioning and the fact that the value must be valid at entry
1168      of the loop.  After passing through the above code, we see that
1169      the correct new number of iterations is this:  */
1170   gcc_assert (!desc->const_iter);
1171   desc->niter_expr =
1172     simplify_gen_binary (UDIV, desc->mode, old_niter,
1173                          GEN_INT (max_unroll + 1));
1174   desc->niter_max /= max_unroll + 1;
1175   if (exit_at_end)
1176     {
1177       desc->niter_expr =
1178         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1179       desc->noloop_assumptions = NULL_RTX;
1180       desc->niter_max--;
1181     }
1182
1183   if (dump_file)
1184     fprintf (dump_file,
1185              ";; Unrolled loop %d times, counting # of iterations "
1186              "in runtime, %i insns\n",
1187              max_unroll, num_loop_insns (loop));
1188
1189   VEC_free (basic_block, heap, dom_bbs);
1190 }
1191
1192 /* Decide whether to simply peel LOOP and how much.  */
1193 static void
1194 decide_peel_simple (struct loop *loop, int flags)
1195 {
1196   unsigned npeel;
1197   struct niter_desc *desc;
1198
1199   if (!(flags & UAP_PEEL))
1200     {
1201       /* We were not asked to, just return back silently.  */
1202       return;
1203     }
1204
1205   if (dump_file)
1206     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1207
1208   /* npeel = number of iterations to peel.  */
1209   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1210   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1211     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1212
1213   /* Skip big loops.  */
1214   if (!npeel)
1215     {
1216       if (dump_file)
1217         fprintf (dump_file, ";; Not considering loop, is too big\n");
1218       return;
1219     }
1220
1221   /* Check for simple loops.  */
1222   desc = get_simple_loop_desc (loop);
1223
1224   /* Check number of iterations.  */
1225   if (desc->simple_p && !desc->assumptions && desc->const_iter)
1226     {
1227       if (dump_file)
1228         fprintf (dump_file, ";; Loop iterates constant times\n");
1229       return;
1230     }
1231
1232   /* Do not simply peel loops with branches inside -- it increases number
1233      of mispredicts.  */
1234   if (num_loop_branches (loop) > 1)
1235     {
1236       if (dump_file)
1237         fprintf (dump_file, ";; Not peeling, contains branches\n");
1238       return;
1239     }
1240
1241   if (loop->header->count)
1242     {
1243       unsigned niter = expected_loop_iterations (loop);
1244       if (niter + 1 > npeel)
1245         {
1246           if (dump_file)
1247             {
1248               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1249               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1250                        (HOST_WIDEST_INT) (niter + 1));
1251               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1252                        npeel);
1253             }
1254           return;
1255         }
1256       npeel = niter + 1;
1257     }
1258   else
1259     {
1260       /* For now we have no good heuristics to decide whether loop peeling
1261          will be effective, so disable it.  */
1262       if (dump_file)
1263         fprintf (dump_file,
1264                  ";; Not peeling loop, no evidence it will be profitable\n");
1265       return;
1266     }
1267
1268   /* Success.  */
1269   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1270   loop->lpt_decision.times = npeel;
1271
1272   if (dump_file)
1273     fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
1274              loop->lpt_decision.times);
1275 }
1276
1277 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1278    while (cond)
1279      body;
1280
1281    ==>
1282
1283    if (!cond) goto end;
1284    body;
1285    if (!cond) goto end;
1286    body;
1287    while (cond)
1288      body;
1289    end: ;
1290    */
1291 static void
1292 peel_loop_simple (struct loop *loop)
1293 {
1294   sbitmap wont_exit;
1295   unsigned npeel = loop->lpt_decision.times;
1296   struct niter_desc *desc = get_simple_loop_desc (loop);
1297   struct opt_info *opt_info = NULL;
1298   bool ok;
1299
1300   if (flag_split_ivs_in_unroller && npeel > 1)
1301     opt_info = analyze_insns_in_loop (loop);
1302
1303   wont_exit = sbitmap_alloc (npeel + 1);
1304   sbitmap_zero (wont_exit);
1305
1306   opt_info_start_duplication (opt_info);
1307
1308   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1309                                       npeel, wont_exit, NULL,
1310                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1311                                       | (opt_info
1312                                          ? DLTHE_RECORD_COPY_NUMBER
1313                                            : 0));
1314   gcc_assert (ok);
1315
1316   free (wont_exit);
1317
1318   if (opt_info)
1319     {
1320       apply_opt_in_copies (opt_info, npeel, false, false);
1321       free_opt_info (opt_info);
1322     }
1323
1324   if (desc->simple_p)
1325     {
1326       if (desc->const_iter)
1327         {
1328           desc->niter -= npeel;
1329           desc->niter_expr = GEN_INT (desc->niter);
1330           desc->noloop_assumptions = NULL_RTX;
1331         }
1332       else
1333         {
1334           /* We cannot just update niter_expr, as its value might be clobbered
1335              inside loop.  We could handle this by counting the number into
1336              temporary just like we do in runtime unrolling, but it does not
1337              seem worthwhile.  */
1338           free_simple_loop_desc (loop);
1339         }
1340     }
1341   if (dump_file)
1342     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1343 }
1344
1345 /* Decide whether to unroll LOOP stupidly and how much.  */
1346 static void
1347 decide_unroll_stupid (struct loop *loop, int flags)
1348 {
1349   unsigned nunroll, nunroll_by_av, i;
1350   struct niter_desc *desc;
1351
1352   if (!(flags & UAP_UNROLL_ALL))
1353     {
1354       /* We were not asked to, just return back silently.  */
1355       return;
1356     }
1357
1358   if (dump_file)
1359     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1360
1361   /* nunroll = total number of copies of the original loop body in
1362      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1363   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1364   nunroll_by_av
1365     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1366   if (nunroll > nunroll_by_av)
1367     nunroll = nunroll_by_av;
1368   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1369     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1370
1371   if (targetm.loop_unroll_adjust)
1372     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1373
1374   /* Skip big loops.  */
1375   if (nunroll <= 1)
1376     {
1377       if (dump_file)
1378         fprintf (dump_file, ";; Not considering loop, is too big\n");
1379       return;
1380     }
1381
1382   /* Check for simple loops.  */
1383   desc = get_simple_loop_desc (loop);
1384
1385   /* Check simpleness.  */
1386   if (desc->simple_p && !desc->assumptions)
1387     {
1388       if (dump_file)
1389         fprintf (dump_file, ";; The loop is simple\n");
1390       return;
1391     }
1392
1393   /* Do not unroll loops with branches inside -- it increases number
1394      of mispredicts.  */
1395   if (num_loop_branches (loop) > 1)
1396     {
1397       if (dump_file)
1398         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1399       return;
1400     }
1401
1402   /* If we have profile feedback, check whether the loop rolls.  */
1403   if (loop->header->count
1404       && expected_loop_iterations (loop) < 2 * nunroll)
1405     {
1406       if (dump_file)
1407         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1408       return;
1409     }
1410
1411   /* Success.  Now force nunroll to be power of 2, as it seems that this
1412      improves results (partially because of better alignments, partially
1413      because of some dark magic).  */
1414   for (i = 1; 2 * i <= nunroll; i *= 2)
1415     continue;
1416
1417   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1418   loop->lpt_decision.times = i - 1;
1419
1420   if (dump_file)
1421     fprintf (dump_file,
1422              ";; Decided to unroll the loop stupidly, %d times.\n",
1423              loop->lpt_decision.times);
1424 }
1425
1426 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1427    while (cond)
1428      body;
1429
1430    ==>
1431
1432    while (cond)
1433      {
1434        body;
1435        if (!cond) break;
1436        body;
1437        if (!cond) break;
1438        body;
1439        if (!cond) break;
1440        body;
1441      }
1442    */
1443 static void
1444 unroll_loop_stupid (struct loop *loop)
1445 {
1446   sbitmap wont_exit;
1447   unsigned nunroll = loop->lpt_decision.times;
1448   struct niter_desc *desc = get_simple_loop_desc (loop);
1449   struct opt_info *opt_info = NULL;
1450   bool ok;
1451
1452   if (flag_split_ivs_in_unroller
1453       || flag_variable_expansion_in_unroller)
1454     opt_info = analyze_insns_in_loop (loop);
1455
1456
1457   wont_exit = sbitmap_alloc (nunroll + 1);
1458   sbitmap_zero (wont_exit);
1459   opt_info_start_duplication (opt_info);
1460
1461   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1462                                       nunroll, wont_exit,
1463                                       NULL, NULL,
1464                                       DLTHE_FLAG_UPDATE_FREQ
1465                                       | (opt_info
1466                                          ? DLTHE_RECORD_COPY_NUMBER
1467                                            : 0));
1468   gcc_assert (ok);
1469
1470   if (opt_info)
1471     {
1472       apply_opt_in_copies (opt_info, nunroll, true, true);
1473       free_opt_info (opt_info);
1474     }
1475
1476   free (wont_exit);
1477
1478   if (desc->simple_p)
1479     {
1480       /* We indeed may get here provided that there are nontrivial assumptions
1481          for a loop to be really simple.  We could update the counts, but the
1482          problem is that we are unable to decide which exit will be taken
1483          (not really true in case the number of iterations is constant,
1484          but noone will do anything with this information, so we do not
1485          worry about it).  */
1486       desc->simple_p = false;
1487     }
1488
1489   if (dump_file)
1490     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1491              nunroll, num_loop_insns (loop));
1492 }
1493
1494 /* A hash function for information about insns to split.  */
1495
1496 static hashval_t
1497 si_info_hash (const void *ivts)
1498 {
1499   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1500 }
1501
1502 /* An equality functions for information about insns to split.  */
1503
1504 static int
1505 si_info_eq (const void *ivts1, const void *ivts2)
1506 {
1507   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1508   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1509
1510   return i1->insn == i2->insn;
1511 }
1512
1513 /* Return a hash for VES, which is really a "var_to_expand *".  */
1514
1515 static hashval_t
1516 ve_info_hash (const void *ves)
1517 {
1518   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1519 }
1520
1521 /* Return true if IVTS1 and IVTS2 (which are really both of type
1522    "var_to_expand *") refer to the same instruction.  */
1523
1524 static int
1525 ve_info_eq (const void *ivts1, const void *ivts2)
1526 {
1527   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1528   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1529
1530   return i1->insn == i2->insn;
1531 }
1532
1533 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1534    Set *DEBUG_USES to the number of debug insns that reference the
1535    variable.  */
1536
1537 bool
1538 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1539                                   int *debug_uses)
1540 {
1541   basic_block *body, bb;
1542   unsigned i;
1543   int count_ref = 0;
1544   rtx insn;
1545
1546   body = get_loop_body (loop);
1547   for (i = 0; i < loop->num_nodes; i++)
1548     {
1549       bb = body[i];
1550
1551       FOR_BB_INSNS (bb, insn)
1552         if (!rtx_referenced_p (reg, insn))
1553           continue;
1554         else if (DEBUG_INSN_P (insn))
1555           ++*debug_uses;
1556         else if (++count_ref > 1)
1557           break;
1558     }
1559   free (body);
1560   return (count_ref  == 1);
1561 }
1562
1563 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1564
1565 static void
1566 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1567 {
1568   basic_block *body, bb;
1569   unsigned i;
1570   rtx insn;
1571
1572   body = get_loop_body (loop);
1573   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1574     {
1575       bb = body[i];
1576
1577       FOR_BB_INSNS (bb, insn)
1578         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1579           continue;
1580         else
1581           {
1582             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1583                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1584             if (!--debug_uses)
1585               break;
1586           }
1587     }
1588   free (body);
1589 }
1590
1591 /* Determine whether INSN contains an accumulator
1592    which can be expanded into separate copies,
1593    one for each copy of the LOOP body.
1594
1595    for (i = 0 ; i < n; i++)
1596      sum += a[i];
1597
1598    ==>
1599
1600    sum += a[i]
1601    ....
1602    i = i+1;
1603    sum1 += a[i]
1604    ....
1605    i = i+1
1606    sum2 += a[i];
1607    ....
1608
1609    Return NULL if INSN contains no opportunity for expansion of accumulator.
1610    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1611    information and return a pointer to it.
1612 */
1613
1614 static struct var_to_expand *
1615 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1616 {
1617   rtx set, dest, src;
1618   struct var_to_expand *ves;
1619   unsigned accum_pos;
1620   enum rtx_code code;
1621   int debug_uses = 0;
1622
1623   set = single_set (insn);
1624   if (!set)
1625     return NULL;
1626
1627   dest = SET_DEST (set);
1628   src = SET_SRC (set);
1629   code = GET_CODE (src);
1630
1631   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1632     return NULL;
1633
1634   if (FLOAT_MODE_P (GET_MODE (dest)))
1635     {
1636       if (!flag_associative_math)
1637         return NULL;
1638       /* In the case of FMA, we're also changing the rounding.  */
1639       if (code == FMA && !flag_unsafe_math_optimizations)
1640         return NULL;
1641     }
1642
1643   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1644      in MD.  But if there is no optab to generate the insn, we can not
1645      perform the variable expansion.  This can happen if an MD provides
1646      an insn but not a named pattern to generate it, for example to avoid
1647      producing code that needs additional mode switches like for x87/mmx.
1648
1649      So we check have_insn_for which looks for an optab for the operation
1650      in SRC.  If it doesn't exist, we can't perform the expansion even
1651      though INSN is valid.  */
1652   if (!have_insn_for (code, GET_MODE (src)))
1653     return NULL;
1654
1655   if (!REG_P (dest)
1656       && !(GET_CODE (dest) == SUBREG
1657            && REG_P (SUBREG_REG (dest))))
1658     return NULL;
1659
1660   /* Find the accumulator use within the operation.  */
1661   if (code == FMA)
1662     {
1663       /* We only support accumulation via FMA in the ADD position.  */
1664       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1665         return NULL;
1666       accum_pos = 2;
1667     }
1668   else if (rtx_equal_p (dest, XEXP (src, 0)))
1669     accum_pos = 0;
1670   else if (rtx_equal_p (dest, XEXP (src, 1)))
1671     {
1672       /* The method of expansion that we are using; which includes the
1673          initialization of the expansions with zero and the summation of
1674          the expansions at the end of the computation will yield wrong
1675          results for (x = something - x) thus avoid using it in that case.  */
1676       if (code == MINUS)
1677         return NULL;
1678       accum_pos = 1;
1679     }
1680   else
1681     return NULL;
1682
1683   /* It must not otherwise be used.  */
1684   if (code == FMA)
1685     {
1686       if (rtx_referenced_p (dest, XEXP (src, 0))
1687           || rtx_referenced_p (dest, XEXP (src, 1)))
1688         return NULL;
1689     }
1690   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1691     return NULL;
1692
1693   /* It must be used in exactly one insn.  */
1694   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1695     return NULL;
1696
1697   if (dump_file)
1698     {
1699       fprintf (dump_file, "\n;; Expanding Accumulator ");
1700       print_rtl (dump_file, dest);
1701       fprintf (dump_file, "\n");
1702     }
1703
1704   if (debug_uses)
1705     /* Instead of resetting the debug insns, we could replace each
1706        debug use in the loop with the sum or product of all expanded
1707        accummulators.  Since we'll only know of all expansions at the
1708        end, we'd have to keep track of which vars_to_expand a debug
1709        insn in the loop references, take note of each copy of the
1710        debug insn during unrolling, and when it's all done, compute
1711        the sum or product of each variable and adjust the original
1712        debug insn and each copy thereof.  What a pain!  */
1713     reset_debug_uses_in_loop (loop, dest, debug_uses);
1714
1715   /* Record the accumulator to expand.  */
1716   ves = XNEW (struct var_to_expand);
1717   ves->insn = insn;
1718   ves->reg = copy_rtx (dest);
1719   ves->var_expansions = VEC_alloc (rtx, heap, 1);
1720   ves->next = NULL;
1721   ves->op = GET_CODE (src);
1722   ves->expansion_count = 0;
1723   ves->reuse_expansion = 0;
1724   ves->accum_pos = accum_pos;
1725   return ves;
1726 }
1727
1728 /* Determine whether there is an induction variable in INSN that
1729    we would like to split during unrolling.
1730
1731    I.e. replace
1732
1733    i = i + 1;
1734    ...
1735    i = i + 1;
1736    ...
1737    i = i + 1;
1738    ...
1739
1740    type chains by
1741
1742    i0 = i + 1
1743    ...
1744    i = i0 + 1
1745    ...
1746    i = i0 + 2
1747    ...
1748
1749    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1750    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1751    pointer to it.  */
1752
1753 static struct iv_to_split *
1754 analyze_iv_to_split_insn (rtx insn)
1755 {
1756   rtx set, dest;
1757   struct rtx_iv iv;
1758   struct iv_to_split *ivts;
1759   bool ok;
1760
1761   /* For now we just split the basic induction variables.  Later this may be
1762      extended for example by selecting also addresses of memory references.  */
1763   set = single_set (insn);
1764   if (!set)
1765     return NULL;
1766
1767   dest = SET_DEST (set);
1768   if (!REG_P (dest))
1769     return NULL;
1770
1771   if (!biv_p (insn, dest))
1772     return NULL;
1773
1774   ok = iv_analyze_result (insn, dest, &iv);
1775
1776   /* This used to be an assert under the assumption that if biv_p returns
1777      true that iv_analyze_result must also return true.  However, that
1778      assumption is not strictly correct as evidenced by pr25569.
1779
1780      Returning NULL when iv_analyze_result returns false is safe and
1781      avoids the problems in pr25569 until the iv_analyze_* routines
1782      can be fixed, which is apparently hard and time consuming
1783      according to their author.  */
1784   if (! ok)
1785     return NULL;
1786
1787   if (iv.step == const0_rtx
1788       || iv.mode != iv.extend_mode)
1789     return NULL;
1790
1791   /* Record the insn to split.  */
1792   ivts = XNEW (struct iv_to_split);
1793   ivts->insn = insn;
1794   ivts->base_var = NULL_RTX;
1795   ivts->step = iv.step;
1796   ivts->next = NULL;
1797   ivts->n_loc = 1;
1798   ivts->loc[0] = 1;
1799
1800   return ivts;
1801 }
1802
1803 /* Determines which of insns in LOOP can be optimized.
1804    Return a OPT_INFO struct with the relevant hash tables filled
1805    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1806    is undefined for the return value.  */
1807
1808 static struct opt_info *
1809 analyze_insns_in_loop (struct loop *loop)
1810 {
1811   basic_block *body, bb;
1812   unsigned i;
1813   struct opt_info *opt_info = XCNEW (struct opt_info);
1814   rtx insn;
1815   struct iv_to_split *ivts = NULL;
1816   struct var_to_expand *ves = NULL;
1817   PTR *slot1;
1818   PTR *slot2;
1819   VEC (edge, heap) *edges = get_loop_exit_edges (loop);
1820   edge exit;
1821   bool can_apply = false;
1822
1823   iv_analysis_loop_init (loop);
1824
1825   body = get_loop_body (loop);
1826
1827   if (flag_split_ivs_in_unroller)
1828     {
1829       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1830                                               si_info_hash, si_info_eq, free);
1831       opt_info->iv_to_split_head = NULL;
1832       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1833     }
1834
1835   /* Record the loop exit bb and loop preheader before the unrolling.  */
1836   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1837
1838   if (VEC_length (edge, edges) == 1)
1839     {
1840       exit = VEC_index (edge, edges, 0);
1841       if (!(exit->flags & EDGE_COMPLEX))
1842         {
1843           opt_info->loop_exit = split_edge (exit);
1844           can_apply = true;
1845         }
1846     }
1847
1848   if (flag_variable_expansion_in_unroller
1849       && can_apply)
1850     {
1851       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1852                                                         ve_info_hash,
1853                                                         ve_info_eq, free);
1854       opt_info->var_to_expand_head = NULL;
1855       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1856     }
1857
1858   for (i = 0; i < loop->num_nodes; i++)
1859     {
1860       bb = body[i];
1861       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1862         continue;
1863
1864       FOR_BB_INSNS (bb, insn)
1865       {
1866         if (!INSN_P (insn))
1867           continue;
1868
1869         if (opt_info->insns_to_split)
1870           ivts = analyze_iv_to_split_insn (insn);
1871
1872         if (ivts)
1873           {
1874             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1875             gcc_assert (*slot1 == NULL);
1876             *slot1 = ivts;
1877             *opt_info->iv_to_split_tail = ivts;
1878             opt_info->iv_to_split_tail = &ivts->next;
1879             continue;
1880           }
1881
1882         if (opt_info->insns_with_var_to_expand)
1883           ves = analyze_insn_to_expand_var (loop, insn);
1884
1885         if (ves)
1886           {
1887             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1888             gcc_assert (*slot2 == NULL);
1889             *slot2 = ves;
1890             *opt_info->var_to_expand_tail = ves;
1891             opt_info->var_to_expand_tail = &ves->next;
1892           }
1893       }
1894     }
1895
1896   VEC_free (edge, heap, edges);
1897   free (body);
1898   return opt_info;
1899 }
1900
1901 /* Called just before loop duplication.  Records start of duplicated area
1902    to OPT_INFO.  */
1903
1904 static void
1905 opt_info_start_duplication (struct opt_info *opt_info)
1906 {
1907   if (opt_info)
1908     opt_info->first_new_block = last_basic_block;
1909 }
1910
/* Return the number of iterations that separate the initialization of the
   base variable from copy number N_COPY.  N_COPIES is the total number of
   newly created copies.  UNROLLING is true if the loop is being unrolled
   (as opposed to peeled).  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the base variable is initialized in the original loop
     body, which has number 0, so the distance is the copy number.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization happens in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
1935
1936 /* Locate in EXPR the expression corresponding to the location recorded
1937    in IVTS, and return a pointer to the RTX for this location.  */
1938
1939 static rtx *
1940 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1941 {
1942   unsigned i;
1943   rtx *ret = &expr;
1944
1945   for (i = 0; i < ivts->n_loc; i++)
1946     ret = &XEXP (*ret, ivts->loc[i]);
1947
1948   return ret;
1949 }
1950
1951 /* Allocate basic variable for the induction variable chain.  */
1952
1953 static void
1954 allocate_basic_variable (struct iv_to_split *ivts)
1955 {
1956   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
1957
1958   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1959 }
1960
1961 /* Insert initialization of basic variable of IVTS before INSN, taking
1962    the initial value from INSN.  */
1963
1964 static void
1965 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
1966 {
1967   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
1968   rtx seq;
1969
1970   start_sequence ();
1971   expr = force_operand (expr, ivts->base_var);
1972   if (expr != ivts->base_var)
1973     emit_move_insn (ivts->base_var, expr);
1974   seq = get_insns ();
1975   end_sequence ();
1976
1977   emit_insn_before (seq, insn);
1978 }
1979
1980 /* Replace the use of induction variable described in IVTS in INSN
1981    by base variable + DELTA * step.  */
1982
1983 static void
1984 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
1985 {
1986   rtx expr, *loc, seq, incr, var;
1987   enum machine_mode mode = GET_MODE (ivts->base_var);
1988   rtx src, dest, set;
1989
1990   /* Construct base + DELTA * step.  */
1991   if (!delta)
1992     expr = ivts->base_var;
1993   else
1994     {
1995       incr = simplify_gen_binary (MULT, mode,
1996                                   ivts->step, gen_int_mode (delta, mode));
1997       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
1998                                   ivts->base_var, incr);
1999     }
2000
2001   /* Figure out where to do the replacement.  */
2002   loc = get_ivts_expr (single_set (insn), ivts);
2003
2004   /* If we can make the replacement right away, we're done.  */
2005   if (validate_change (insn, loc, expr, 0))
2006     return;
2007
2008   /* Otherwise, force EXPR into a register and try again.  */
2009   start_sequence ();
2010   var = gen_reg_rtx (mode);
2011   expr = force_operand (expr, var);
2012   if (expr != var)
2013     emit_move_insn (var, expr);
2014   seq = get_insns ();
2015   end_sequence ();
2016   emit_insn_before (seq, insn);
2017
2018   if (validate_change (insn, loc, var, 0))
2019     return;
2020
2021   /* The last chance.  Try recreating the assignment in insn
2022      completely from scratch.  */
2023   set = single_set (insn);
2024   gcc_assert (set);
2025
2026   start_sequence ();
2027   *loc = var;
2028   src = copy_rtx (SET_SRC (set));
2029   dest = copy_rtx (SET_DEST (set));
2030   src = force_operand (src, dest);
2031   if (src != dest)
2032     emit_move_insn (dest, src);
2033   seq = get_insns ();
2034   end_sequence ();
2035
2036   emit_insn_before (seq, insn);
2037   delete_insn (insn);
2038 }
2039
2040
2041 /* Return one expansion of the accumulator recorded in struct VE.  */
2042
2043 static rtx
2044 get_expansion (struct var_to_expand *ve)
2045 {
2046   rtx reg;
2047
2048   if (ve->reuse_expansion == 0)
2049     reg = ve->reg;
2050   else
2051     reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);
2052
2053   if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
2054     ve->reuse_expansion = 0;
2055   else
2056     ve->reuse_expansion++;
2057
2058   return reg;
2059 }
2060
2061
2062 /* Given INSN replace the uses of the accumulator recorded in VE
2063    with a new register.  */
2064
2065 static void
2066 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2067 {
2068   rtx new_reg, set;
2069   bool really_new_expansion = false;
2070
2071   set = single_set (insn);
2072   gcc_assert (set);
2073
2074   /* Generate a new register only if the expansion limit has not been
2075      reached.  Else reuse an already existing expansion.  */
2076   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2077     {
2078       really_new_expansion = true;
2079       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2080     }
2081   else
2082     new_reg = get_expansion (ve);
2083
2084   validate_change (insn, &SET_DEST (set), new_reg, 1);
2085   validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);
2086
2087   if (apply_change_group ())
2088     if (really_new_expansion)
2089       {
2090         VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
2091         ve->expansion_count++;
2092       }
2093 }
2094
2095 /* Initialize the variable expansions in loop preheader.  PLACE is the
2096    loop-preheader basic block where the initialization of the
2097    expansions should take place.  The expansions are initialized with
2098    (-0) when the operation is plus or minus to honor sign zero.  This
2099    way we can prevent cases where the sign of the final result is
2100    effected by the sign of the expansion.  Here is an example to
2101    demonstrate this:
2102
2103    for (i = 0 ; i < n; i++)
2104      sum += something;
2105
2106    ==>
2107
2108    sum += something
2109    ....
2110    i = i+1;
2111    sum1 += something
2112    ....
2113    i = i+1
2114    sum2 += something;
2115    ....
2116
2117    When SUM is initialized with -zero and SOMETHING is also -zero; the
2118    final result of sum should be -zero thus the expansions sum1 and sum2
2119    should be initialized with -zero as well (otherwise we will get +zero
2120    as the final result).  */
2121
2122 static void
2123 insert_var_expansion_initialization (struct var_to_expand *ve,
2124                                      basic_block place)
2125 {
2126   rtx seq, var, zero_init, insn;
2127   unsigned i;
2128   enum machine_mode mode = GET_MODE (ve->reg);
2129   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2130
2131   if (VEC_length (rtx, ve->var_expansions) == 0)
2132     return;
2133
2134   start_sequence ();
2135   switch (ve->op)
2136     {
2137     case FMA:
2138       /* Note that we only accumulate FMA via the ADD operand.  */
2139     case PLUS:
2140     case MINUS:
2141       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2142         {
2143           if (honor_signed_zero_p)
2144             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2145           else
2146             zero_init = CONST0_RTX (mode);
2147           emit_move_insn (var, zero_init);
2148         }
2149       break;
2150
2151     case MULT:
2152       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2153         {
2154           zero_init = CONST1_RTX (GET_MODE (var));
2155           emit_move_insn (var, zero_init);
2156         }
2157       break;
2158
2159     default:
2160       gcc_unreachable ();
2161     }
2162
2163   seq = get_insns ();
2164   end_sequence ();
2165
2166   insn = BB_HEAD (place);
2167   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2168     insn = NEXT_INSN (insn);
2169
2170   emit_insn_after (seq, insn);
2171 }
2172
2173 /* Combine the variable expansions at the loop exit.  PLACE is the
2174    loop exit basic block where the summation of the expansions should
2175    take place.  */
2176
2177 static void
2178 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2179 {
2180   rtx sum = ve->reg;
2181   rtx expr, seq, var, insn;
2182   unsigned i;
2183
2184   if (VEC_length (rtx, ve->var_expansions) == 0)
2185     return;
2186
2187   start_sequence ();
2188   switch (ve->op)
2189     {
2190     case FMA:
2191       /* Note that we only accumulate FMA via the ADD operand.  */
2192     case PLUS:
2193     case MINUS:
2194       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2195         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2196       break;
2197
2198     case MULT:
2199       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2200         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2201       break;
2202
2203     default:
2204       gcc_unreachable ();
2205     }
2206
2207   expr = force_operand (sum, ve->reg);
2208   if (expr != ve->reg)
2209     emit_move_insn (ve->reg, expr);
2210   seq = get_insns ();
2211   end_sequence ();
2212
2213   insn = BB_HEAD (place);
2214   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2215     insn = NEXT_INSN (insn);
2216
2217   emit_insn_after (seq, insn);
2218 }
2219
2220 /* Apply loop optimizations in loop copies using the
2221    data which gathered during the unrolling.  Structure
2222    OPT_INFO record that data.
2223
2224    UNROLLING is true if we unrolled (not peeled) the loop.
2225    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2226    the loop (as it should happen in complete unrolling, but not in ordinary
2227    peeling of the loop).  */
2228
2229 static void
2230 apply_opt_in_copies (struct opt_info *opt_info,
2231                      unsigned n_copies, bool unrolling,
2232                      bool rewrite_original_loop)
2233 {
2234   unsigned i, delta;
2235   basic_block bb, orig_bb;
2236   rtx insn, orig_insn, next;
2237   struct iv_to_split ivts_templ, *ivts;
2238   struct var_to_expand ve_templ, *ves;
2239
2240   /* Sanity check -- we need to put initialization in the original loop
2241      body.  */
2242   gcc_assert (!unrolling || rewrite_original_loop);
2243
2244   /* Allocate the basic variables (i0).  */
2245   if (opt_info->insns_to_split)
2246     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2247       allocate_basic_variable (ivts);
2248
2249   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2250     {
2251       bb = BASIC_BLOCK (i);
2252       orig_bb = get_bb_original (bb);
2253
2254       /* bb->aux holds position in copy sequence initialized by
2255          duplicate_loop_to_header_edge.  */
2256       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2257                                         unrolling);
2258       bb->aux = 0;
2259       orig_insn = BB_HEAD (orig_bb);
2260       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2261         {
2262           next = NEXT_INSN (insn);
2263           if (!INSN_P (insn)
2264               || (DEBUG_INSN_P (insn)
2265                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2266             continue;
2267
2268           while (!INSN_P (orig_insn)
2269                  || (DEBUG_INSN_P (orig_insn)
2270                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2271                          == LABEL_DECL)))
2272             orig_insn = NEXT_INSN (orig_insn);
2273
2274           ivts_templ.insn = orig_insn;
2275           ve_templ.insn = orig_insn;
2276
2277           /* Apply splitting iv optimization.  */
2278           if (opt_info->insns_to_split)
2279             {
2280               ivts = (struct iv_to_split *)
2281                 htab_find (opt_info->insns_to_split, &ivts_templ);
2282
2283               if (ivts)
2284                 {
2285                   gcc_assert (GET_CODE (PATTERN (insn))
2286                               == GET_CODE (PATTERN (orig_insn)));
2287
2288                   if (!delta)
2289                     insert_base_initialization (ivts, insn);
2290                   split_iv (ivts, insn, delta);
2291                 }
2292             }
2293           /* Apply variable expansion optimization.  */
2294           if (unrolling && opt_info->insns_with_var_to_expand)
2295             {
2296               ves = (struct var_to_expand *)
2297                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2298               if (ves)
2299                 {
2300                   gcc_assert (GET_CODE (PATTERN (insn))
2301                               == GET_CODE (PATTERN (orig_insn)));
2302                   expand_var_during_unrolling (ves, insn);
2303                 }
2304             }
2305           orig_insn = NEXT_INSN (orig_insn);
2306         }
2307     }
2308
2309   if (!rewrite_original_loop)
2310     return;
2311
2312   /* Initialize the variable expansions in the loop preheader
2313      and take care of combining them at the loop exit.  */
2314   if (opt_info->insns_with_var_to_expand)
2315     {
2316       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2317         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2318       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2319         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2320     }
2321
2322   /* Rewrite also the original loop body.  Find them as originals of the blocks
2323      in the last copied iteration, i.e. those that have
2324      get_bb_copy (get_bb_original (bb)) == bb.  */
2325   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2326     {
2327       bb = BASIC_BLOCK (i);
2328       orig_bb = get_bb_original (bb);
2329       if (get_bb_copy (orig_bb) != bb)
2330         continue;
2331
2332       delta = determine_split_iv_delta (0, n_copies, unrolling);
2333       for (orig_insn = BB_HEAD (orig_bb);
2334            orig_insn != NEXT_INSN (BB_END (bb));
2335            orig_insn = next)
2336         {
2337           next = NEXT_INSN (orig_insn);
2338
2339           if (!INSN_P (orig_insn))
2340             continue;
2341
2342           ivts_templ.insn = orig_insn;
2343           if (opt_info->insns_to_split)
2344             {
2345               ivts = (struct iv_to_split *)
2346                 htab_find (opt_info->insns_to_split, &ivts_templ);
2347               if (ivts)
2348                 {
2349                   if (!delta)
2350                     insert_base_initialization (ivts, orig_insn);
2351                   split_iv (ivts, orig_insn, delta);
2352                   continue;
2353                 }
2354             }
2355
2356         }
2357     }
2358 }
2359
2360 /* Release OPT_INFO.  */
2361
2362 static void
2363 free_opt_info (struct opt_info *opt_info)
2364 {
2365   if (opt_info->insns_to_split)
2366     htab_delete (opt_info->insns_to_split);
2367   if (opt_info->insns_with_var_to_expand)
2368     {
2369       struct var_to_expand *ves;
2370
2371       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2372         VEC_free (rtx, heap, ves->var_expansions);
2373       htab_delete (opt_info->insns_with_var_to_expand);
2374     }
2375   free (opt_info);
2376 }