gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "tree.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hash-table.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
  42    by fewer testings of exit conditions.  The main problem is code growth,
  43    that impacts performance negatively due to effect of caches.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
  64    There are a lot of parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
  66
  67    ??? A great problem is that we don't have a good way to determine
  68    how many times we should unroll the loop; the experiments I have made
  69    showed that this choice may affect performance on the order of several %.
  70    */
  71
  72 /* Information about induction variables to split.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx insn;             /* The insn in that the induction variable occurs.  */
  77   rtx orig_var;         /* The variable (register) for the IV before split.  */
  78   rtx base_var;         /* The variable on that the values in the further
  79                            iterations are based.  */
  80   rtx step;             /* Step of the induction variable.  */
  81   struct iv_to_split *next; /* Next entry in walking order.  */
  82   unsigned n_loc;
  83   unsigned loc[3];      /* Location where the definition of the induction
  84                            variable occurs in the insn.  For example if
  85                            N_LOC is 2, the expression is located at
  86                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  87 };
  88
  89 /* Information about accumulators to expand.  */
  90
  91 struct var_to_expand
  92 {
  93   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  94   rtx reg;                         /* The accumulator which is expanded.  */
  95   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  96   struct var_to_expand *next;      /* Next entry in walking order.  */
  97   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  98                                       or multiplication.  */
  99   int expansion_count;             /* Count the number of expansions generated so far.  */
 100   int reuse_expansion;             /* The expansion we intend to reuse to expand
 101                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 102                                       the original accumulator.  Else use
 103                                       var_expansions[REUSE_EXPANSION - 1].  */
 104 };
 105
 106 /* Hashtable helper for iv_to_split.  */
 107
 108 struct iv_split_hasher : typed_free_remove <iv_to_split>
 109 {
 110   typedef iv_to_split value_type;
 111   typedef iv_to_split compare_type;
 112   static inline hashval_t hash (const value_type *);
 113   static inline bool equal (const value_type *, const compare_type *);
 114 };
 115
 116
 117 /* A hash function for information about insns to split.  */
 118
 119 inline hashval_t
 120 iv_split_hasher::hash (const value_type *ivts)
 121 {
 122   return (hashval_t) INSN_UID (ivts->insn);
 123 }
 124
 125 /* An equality functions for information about insns to split.  */
 126
 127 inline bool
 128 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
 129 {
 130   return i1->insn == i2->insn;
 131 }
 132
 133 /* Hashtable helper for iv_to_split.  */
 134
 135 struct var_expand_hasher : typed_free_remove <var_to_expand>
 136 {
 137   typedef var_to_expand value_type;
 138   typedef var_to_expand compare_type;
 139   static inline hashval_t hash (const value_type *);
 140   static inline bool equal (const value_type *, const compare_type *);
 141 };
 142
 143 /* Return a hash for VES.  */
 144
 145 inline hashval_t
 146 var_expand_hasher::hash (const value_type *ves)
 147 {
 148   return (hashval_t) INSN_UID (ves->insn);
 149 }
 150
 151 /* Return true if I1 and I2 refer to the same instruction.  */
 152
 153 inline bool
 154 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
 155 {
 156   return i1->insn == i2->insn;
 157 }
 158
 159 /* Information about optimization applied in
 160    the unrolled loop.  */
 161
 162 struct opt_info
 163 {
 164   hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
 165                                                   split.  */
 166   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 167   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 168   hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
 169                                         insns with accumulators to expand.  */
 170   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 171   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 172   unsigned first_new_block;        /* The first basic block that was
 173                                       duplicated.  */
 174   basic_block loop_exit;           /* The loop exit basic block.  */
 175   basic_block loop_preheader;      /* The loop preheader basic block.  */
 176 };
 177
 178 static void decide_unrolling_and_peeling (int);
 179 static void peel_loops_completely (int);
 180 static void decide_peel_simple (struct loop *, int);
 181 static void decide_peel_once_rolling (struct loop *, int);
 182 static void decide_peel_completely (struct loop *, int);
 183 static void decide_unroll_stupid (struct loop *, int);
 184 static void decide_unroll_constant_iterations (struct loop *, int);
 185 static void decide_unroll_runtime_iterations (struct loop *, int);
 186 static void peel_loop_simple (struct loop *);
 187 static void peel_loop_completely (struct loop *);
 188 static void unroll_loop_stupid (struct loop *);
 189 static void unroll_loop_constant_iterations (struct loop *);
 190 static void unroll_loop_runtime_iterations (struct loop *);
 191 static struct opt_info *analyze_insns_in_loop (struct loop *);
 192 static void opt_info_start_duplication (struct opt_info *);
 193 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 194 static void free_opt_info (struct opt_info *);
 195 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 196 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 197 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 198 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 199 static void insert_var_expansion_initialization (struct var_to_expand *,
 200                                                  basic_block);
 201 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 202                                              basic_block);
 203 static rtx get_expansion (struct var_to_expand *);
 204
 205 /* Emit a message summarizing the unroll or peel that will be
 206    performed for LOOP, along with the loop's location LOCUS, if
 207    appropriate given the dump or -fopt-info settings.  */
 208
 209 static void
 210 report_unroll_peel (struct loop *loop, location_t locus)
 211 {
 212   struct niter_desc *desc;
 213   int niters = 0;
 214   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 215
 216   if (loop->lpt_decision.decision == LPT_NONE)
 217     return;
 218
 219   if (!dump_enabled_p ())
 220     return;
 221
 222   /* In the special case where the loop never iterated, emit
 223      a different message so that we don't report an unroll by 0.
 224      This matches the equivalent message emitted during tree unrolling.  */
 225   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 226       && !loop->lpt_decision.times)
 227     {
 228       dump_printf_loc (report_flags, locus,
 229                        "loop turned into non-loop; it never loops.\n");
 230       return;
 231     }
 232
 233   desc = get_simple_loop_desc (loop);
 234
 235   if (desc->const_iter)
 236     niters = desc->niter;
 237   else if (loop->header->count)
 238     niters = expected_loop_iterations (loop);
 239
 240   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 241     dump_printf_loc (report_flags, locus,
 242                      "loop with %d iterations completely unrolled",
 243                      loop->lpt_decision.times + 1);
 244   else
 245     dump_printf_loc (report_flags, locus,
 246                      "loop %s %d times",
 247                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 248                        ? "peeled" : "unrolled"),
 249                      loop->lpt_decision.times);
 250   if (profile_info)
 251     dump_printf (report_flags,
 252                  " (header execution count %d",
 253                  (int)loop->header->count);
 254   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 255     dump_printf (report_flags,
 256                  "%s%s iterations %d)",
 257                  profile_info ? ", " : " (",
 258                  desc->const_iter ? "const" : "average",
 259                  niters);
 260   else if (profile_info)
 261     dump_printf (report_flags, ")");
 262
 263   dump_printf (report_flags, "\n");
 264 }
 265
 266 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 267 void
 268 unroll_and_peel_loops (int flags)
 269 {
 270   struct loop *loop;
 271   bool changed = false;
 272   loop_iterator li;
 273
 274   /* First perform complete loop peeling (it is almost surely a win,
 275      and affects parameters for further decision a lot).  */
 276   peel_loops_completely (flags);
 277
 278   /* Now decide rest of unrolling and peeling.  */
 279   decide_unrolling_and_peeling (flags);
 280
 281   /* Scan the loops, inner ones first.  */
 282   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 283     {
 284       /* And perform the appropriate transformations.  */
 285       switch (loop->lpt_decision.decision)
 286         {
 287         case LPT_PEEL_COMPLETELY:
 288           /* Already done.  */
 289           gcc_unreachable ();
 290         case LPT_PEEL_SIMPLE:
 291           peel_loop_simple (loop);
 292           changed = true;
 293           break;
 294         case LPT_UNROLL_CONSTANT:
 295           unroll_loop_constant_iterations (loop);
 296           changed = true;
 297           break;
 298         case LPT_UNROLL_RUNTIME:
 299           unroll_loop_runtime_iterations (loop);
 300           changed = true;
 301           break;
 302         case LPT_UNROLL_STUPID:
 303           unroll_loop_stupid (loop);
 304           changed = true;
 305           break;
 306         case LPT_NONE:
 307           break;
 308         default:
 309           gcc_unreachable ();
 310         }
 311     }
 312
 313     if (changed)
 314       {
 315         calculate_dominance_info (CDI_DOMINATORS);
 316         fix_loop_structure (NULL);
 317       }
 318
 319   iv_analysis_done ();
 320 }
 321
 322 /* Check whether exit of the LOOP is at the end of loop body.  */
 323
 324 static bool
 325 loop_exit_at_end_p (struct loop *loop)
 326 {
 327   struct niter_desc *desc = get_simple_loop_desc (loop);
 328   rtx insn;
 329
 330   if (desc->in_edge->dest != loop->latch)
 331     return false;
 332
 333   /* Check that the latch is empty.  */
 334   FOR_BB_INSNS (loop->latch, insn)
 335     {
 336       if (NONDEBUG_INSN_P (insn))
 337         return false;
 338     }
 339
 340   return true;
 341 }
 342
 343 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 344 static void
 345 peel_loops_completely (int flags)
 346 {
 347   struct loop *loop;
 348   loop_iterator li;
 349   bool changed = false;
 350
 351   /* Scan the loops, the inner ones first.  */
 352   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 353     {
 354       loop->lpt_decision.decision = LPT_NONE;
 355       location_t locus = get_loop_location (loop);
 356
 357       if (dump_enabled_p ())
 358         dump_printf_loc (TDF_RTL, locus,
 359                          ";; *** Considering loop %d at BB %d for "
 360                          "complete peeling ***\n",
 361                          loop->num, loop->header->index);
 362
 363       loop->ninsns = num_loop_insns (loop);
 364
 365       decide_peel_once_rolling (loop, flags);
 366       if (loop->lpt_decision.decision == LPT_NONE)
 367         decide_peel_completely (loop, flags);
 368
 369       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 370         {
 371           report_unroll_peel (loop, locus);
 372           peel_loop_completely (loop);
 373           changed = true;
 374         }
 375     }
 376
 377     if (changed)
 378       {
 379         calculate_dominance_info (CDI_DOMINATORS);
 380         fix_loop_structure (NULL);
 381       }
 382 }
 383
 384 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 385 static void
 386 decide_unrolling_and_peeling (int flags)
 387 {
 388   struct loop *loop;
 389   loop_iterator li;
 390
 391   /* Scan the loops, inner ones first.  */
 392   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 393     {
 394       loop->lpt_decision.decision = LPT_NONE;
 395       location_t locus = get_loop_location (loop);
 396
 397       if (dump_enabled_p ())
 398         dump_printf_loc (TDF_RTL, locus,
 399                          ";; *** Considering loop %d at BB %d for "
 400                          "unrolling and peeling ***\n",
 401                          loop->num, loop->header->index);
 402
 403       /* Do not peel cold areas.  */
 404       if (optimize_loop_for_size_p (loop))
 405         {
 406           if (dump_file)
 407             fprintf (dump_file, ";; Not considering loop, cold area\n");
 408           continue;
 409         }
 410
 411       /* Can the loop be manipulated?  */
 412       if (!can_duplicate_loop_p (loop))
 413         {
 414           if (dump_file)
 415             fprintf (dump_file,
 416                      ";; Not considering loop, cannot duplicate\n");
 417           continue;
 418         }
 419
 420       /* Skip non-innermost loops.  */
 421       if (loop->inner)
 422         {
 423           if (dump_file)
 424             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 425           continue;
 426         }
 427
 428       loop->ninsns = num_loop_insns (loop);
 429       loop->av_ninsns = average_num_loop_insns (loop);
 430
 431       /* Try transformations one by one in decreasing order of
 432          priority.  */
 433
 434       decide_unroll_constant_iterations (loop, flags);
 435       if (loop->lpt_decision.decision == LPT_NONE)
 436         decide_unroll_runtime_iterations (loop, flags);
 437       if (loop->lpt_decision.decision == LPT_NONE)
 438         decide_unroll_stupid (loop, flags);
 439       if (loop->lpt_decision.decision == LPT_NONE)
 440         decide_peel_simple (loop, flags);
 441
 442       report_unroll_peel (loop, locus);
 443     }
 444 }
 445
 446 /* Decide whether the LOOP is once rolling and suitable for complete
 447    peeling.  */
 448 static void
 449 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 450 {
 451   struct niter_desc *desc;
 452
 453   if (dump_file)
 454     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 455
 456   /* Is the loop small enough?  */
 457   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 458     {
 459       if (dump_file)
 460         fprintf (dump_file, ";; Not considering loop, is too big\n");
 461       return;
 462     }
 463
 464   /* Check for simple loops.  */
 465   desc = get_simple_loop_desc (loop);
 466
 467   /* Check number of iterations.  */
 468   if (!desc->simple_p
 469       || desc->assumptions
 470       || desc->infinite
 471       || !desc->const_iter
 472       || (desc->niter != 0
 473           && get_max_loop_iterations_int (loop) != 0))
 474     {
 475       if (dump_file)
 476         fprintf (dump_file,
 477                  ";; Unable to prove that the loop rolls exactly once\n");
 478       return;
 479     }
 480
 481   /* Success.  */
 482   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 483 }
 484
 485 /* Decide whether the LOOP is suitable for complete peeling.  */
 486 static void
 487 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 488 {
 489   unsigned npeel;
 490   struct niter_desc *desc;
 491
 492   if (dump_file)
 493     fprintf (dump_file, "\n;; Considering peeling completely\n");
 494
 495   /* Skip non-innermost loops.  */
 496   if (loop->inner)
 497     {
 498       if (dump_file)
 499         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 500       return;
 501     }
 502
 503   /* Do not peel cold areas.  */
 504   if (optimize_loop_for_size_p (loop))
 505     {
 506       if (dump_file)
 507         fprintf (dump_file, ";; Not considering loop, cold area\n");
 508       return;
 509     }
 510
 511   /* Can the loop be manipulated?  */
 512   if (!can_duplicate_loop_p (loop))
 513     {
 514       if (dump_file)
 515         fprintf (dump_file,
 516                  ";; Not considering loop, cannot duplicate\n");
 517       return;
 518     }
 519
 520   /* npeel = number of iterations to peel.  */
 521   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 522   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 523     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 524
 525   /* Is the loop small enough?  */
 526   if (!npeel)
 527     {
 528       if (dump_file)
 529         fprintf (dump_file, ";; Not considering loop, is too big\n");
 530       return;
 531     }
 532
 533   /* Check for simple loops.  */
 534   desc = get_simple_loop_desc (loop);
 535
 536   /* Check number of iterations.  */
 537   if (!desc->simple_p
 538       || desc->assumptions
 539       || !desc->const_iter
 540       || desc->infinite)
 541     {
 542       if (dump_file)
 543         fprintf (dump_file,
 544                  ";; Unable to prove that the loop iterates constant times\n");
 545       return;
 546     }
 547
 548   if (desc->niter > npeel - 1)
 549     {
 550       if (dump_file)
 551         {
 552           fprintf (dump_file,
 553                    ";; Not peeling loop completely, rolls too much (");
 554           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 555           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 556         }
 557       return;
 558     }
 559
 560   /* Success.  */
 561   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 562 }
 563
 564 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 565    completely.  The transformation done:
 566
 567    for (i = 0; i < 4; i++)
 568      body;
 569
 570    ==>
 571
 572    i = 0;
 573    body; i++;
 574    body; i++;
 575    body; i++;
 576    body; i++;
 577    */
 578 static void
 579 peel_loop_completely (struct loop *loop)
 580 {
 581   sbitmap wont_exit;
 582   unsigned HOST_WIDE_INT npeel;
 583   unsigned i;
 584   vec<edge> remove_edges;
 585   edge ein;
 586   struct niter_desc *desc = get_simple_loop_desc (loop);
 587   struct opt_info *opt_info = NULL;
 588
 589   npeel = desc->niter;
 590
 591   if (npeel)
 592     {
 593       bool ok;
 594
 595       wont_exit = sbitmap_alloc (npeel + 1);
 596       bitmap_ones (wont_exit);
 597       bitmap_clear_bit (wont_exit, 0);
 598       if (desc->noloop_assumptions)
 599         bitmap_clear_bit (wont_exit, 1);
 600
 601       remove_edges.create (0);
 602
 603       if (flag_split_ivs_in_unroller)
 604         opt_info = analyze_insns_in_loop (loop);
 605
 606       opt_info_start_duplication (opt_info);
 607       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 608                                           npeel,
 609                                           wont_exit, desc->out_edge,
 610                                           &remove_edges,
 611                                           DLTHE_FLAG_UPDATE_FREQ
 612                                           | DLTHE_FLAG_COMPLETTE_PEEL
 613                                           | (opt_info
 614                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 615       gcc_assert (ok);
 616
 617       free (wont_exit);
 618
 619       if (opt_info)
 620         {
 621           apply_opt_in_copies (opt_info, npeel, false, true);
 622           free_opt_info (opt_info);
 623         }
 624
 625       /* Remove the exit edges.  */
 626       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 627         remove_path (ein);
 628       remove_edges.release ();
 629     }
 630
 631   ein = desc->in_edge;
 632   free_simple_loop_desc (loop);
 633
 634   /* Now remove the unreachable part of the last iteration and cancel
 635      the loop.  */
 636   remove_path (ein);
 637
 638   if (dump_file)
 639     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 640 }
 641
 642 /* Decide whether to unroll LOOP iterating constant number of times
 643    and how much.  */
 644
 645 static void
 646 decide_unroll_constant_iterations (struct loop *loop, int flags)
 647 {
 648   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 649   struct niter_desc *desc;
 650   double_int iterations;
 651
 652   if (!(flags & UAP_UNROLL))
 653     {
 654       /* We were not asked to, just return back silently.  */
 655       return;
 656     }
 657
 658   if (dump_file)
 659     fprintf (dump_file,
 660              "\n;; Considering unrolling loop with constant "
 661              "number of iterations\n");
 662
 663   /* nunroll = total number of copies of the original loop body in
 664      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 665   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 666   nunroll_by_av
 667     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 668   if (nunroll > nunroll_by_av)
 669     nunroll = nunroll_by_av;
 670   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 671     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 672
 673   /* Skip big loops.  */
 674   if (nunroll <= 1)
 675     {
 676       if (dump_file)
 677         fprintf (dump_file, ";; Not considering loop, is too big\n");
 678       return;
 679     }
 680
 681   /* Check for simple loops.  */
 682   desc = get_simple_loop_desc (loop);
 683
 684   /* Check number of iterations.  */
 685   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 686     {
 687       if (dump_file)
 688         fprintf (dump_file,
 689                  ";; Unable to prove that the loop iterates constant times\n");
 690       return;
 691     }
 692
 693   /* Check whether the loop rolls enough to consider.
 694      Consult also loop bounds and profile; in the case the loop has more
 695      than one exit it may well loop less than determined maximal number
 696      of iterations.  */
 697   if (desc->niter < 2 * nunroll
 698       || ((get_estimated_loop_iterations (loop, &iterations)
 699            || get_max_loop_iterations (loop, &iterations))
 700           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 701     {
 702       if (dump_file)
 703         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 704       return;
 705     }
 706
 707   /* Success; now compute number of iterations to unroll.  We alter
 708      nunroll so that as few as possible copies of loop body are
 709      necessary, while still not decreasing the number of unrollings
 710      too much (at most by 1).  */
 711   best_copies = 2 * nunroll + 10;
 712
 713   i = 2 * nunroll + 2;
 714   if (i - 1 >= desc->niter)
 715     i = desc->niter - 2;
 716
 717   for (; i >= nunroll - 1; i--)
 718     {
 719       unsigned exit_mod = desc->niter % (i + 1);
 720
 721       if (!loop_exit_at_end_p (loop))
 722         n_copies = exit_mod + i + 1;
 723       else if (exit_mod != (unsigned) i
 724                || desc->noloop_assumptions != NULL_RTX)
 725         n_copies = exit_mod + i + 2;
 726       else
 727         n_copies = i + 1;
 728
 729       if (n_copies < best_copies)
 730         {
 731           best_copies = n_copies;
 732           best_unroll = i;
 733         }
 734     }
 735
 736   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 737   loop->lpt_decision.times = best_unroll;
 738 }
 739
 740 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 741    The transformation does this:
 742
 743    for (i = 0; i < 102; i++)
 744      body;
 745
 746    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 747
 748    i = 0;
 749    body; i++;
 750    body; i++;
 751    while (i < 102)
 752      {
 753        body; i++;
 754        body; i++;
 755        body; i++;
 756        body; i++;
 757      }
 758   */
 759 static void
 760 unroll_loop_constant_iterations (struct loop *loop)
 761 {
 762   unsigned HOST_WIDE_INT niter;
 763   unsigned exit_mod;
 764   sbitmap wont_exit;
 765   unsigned i;
 766   vec<edge> remove_edges;
 767   edge e;
 768   unsigned max_unroll = loop->lpt_decision.times;
 769   struct niter_desc *desc = get_simple_loop_desc (loop);
 770   bool exit_at_end = loop_exit_at_end_p (loop);
 771   struct opt_info *opt_info = NULL;
 772   bool ok;
 773
 774   niter = desc->niter;
 775
 776   /* Should not get here (such loop should be peeled instead).  */
 777   gcc_assert (niter > max_unroll + 1);
 778
 779   exit_mod = niter % (max_unroll + 1);
 780
 781   wont_exit = sbitmap_alloc (max_unroll + 1);
 782   bitmap_ones (wont_exit);
 783
 784   remove_edges.create (0);
 785   if (flag_split_ivs_in_unroller
 786       || flag_variable_expansion_in_unroller)
 787     opt_info = analyze_insns_in_loop (loop);
 788
 789   if (!exit_at_end)
 790     {
 791       /* The exit is not at the end of the loop; leave exit test
 792          in the first copy, so that the loops that start with test
 793          of exit condition have continuous body after unrolling.  */
 794
 795       if (dump_file)
 796         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 797
 798       /* Peel exit_mod iterations.  */
 799       bitmap_clear_bit (wont_exit, 0);
 800       if (desc->noloop_assumptions)
 801         bitmap_clear_bit (wont_exit, 1);
 802
 803       if (exit_mod)
 804         {
 805           opt_info_start_duplication (opt_info);
 806           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 807                                               exit_mod,
 808                                               wont_exit, desc->out_edge,
 809                                               &remove_edges,
 810                                               DLTHE_FLAG_UPDATE_FREQ
 811                                               | (opt_info && exit_mod > 1
 812                                                  ? DLTHE_RECORD_COPY_NUMBER
 813                                                    : 0));
 814           gcc_assert (ok);
 815
 816           if (opt_info && exit_mod > 1)
 817             apply_opt_in_copies (opt_info, exit_mod, false, false);
 818
 819           desc->noloop_assumptions = NULL_RTX;
 820           desc->niter -= exit_mod;
 821           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 822           if (loop->any_estimate
 823               && double_int::from_uhwi (exit_mod).ule
 824                    (loop->nb_iterations_estimate))
 825             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 826           else
 827             loop->any_estimate = false;
 828         }
 829
 830       bitmap_set_bit (wont_exit, 1);
 831     }
 832   else
 833     {
 834       /* Leave exit test in last copy, for the same reason as above if
 835          the loop tests the condition at the end of loop body.  */
 836
 837       if (dump_file)
 838         fprintf (dump_file, ";; Condition at end of loop.\n");
 839
 840       /* We know that niter >= max_unroll + 2; so we do not need to care of
 841          case when we would exit before reaching the loop.  So just peel
 842          exit_mod + 1 iterations.  */
 843       if (exit_mod != max_unroll
 844           || desc->noloop_assumptions)
 845         {
 846           bitmap_clear_bit (wont_exit, 0);
 847           if (desc->noloop_assumptions)
 848             bitmap_clear_bit (wont_exit, 1);
 849
 850           opt_info_start_duplication (opt_info);
 851           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 852                                               exit_mod + 1,
 853                                               wont_exit, desc->out_edge,
 854                                               &remove_edges,
 855                                               DLTHE_FLAG_UPDATE_FREQ
 856                                               | (opt_info && exit_mod > 0
 857                                                  ? DLTHE_RECORD_COPY_NUMBER
 858                                                    : 0));
 859           gcc_assert (ok);
 860
 861           if (opt_info && exit_mod > 0)
 862             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 863
 864           desc->niter -= exit_mod + 1;
 865           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 866           if (loop->any_estimate
 867               && double_int::from_uhwi (exit_mod + 1).ule
 868                    (loop->nb_iterations_estimate))
 869             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 870           else
 871             loop->any_estimate = false;
 872           desc->noloop_assumptions = NULL_RTX;
 873
 874           bitmap_set_bit (wont_exit, 0);
 875           bitmap_set_bit (wont_exit, 1);
 876         }
 877
 878       bitmap_clear_bit (wont_exit, max_unroll);
 879     }
 880
 881   /* Now unroll the loop.  */
 882
 883   opt_info_start_duplication (opt_info);
 884   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 885                                       max_unroll,
 886                                       wont_exit, desc->out_edge,
 887                                       &remove_edges,
 888                                       DLTHE_FLAG_UPDATE_FREQ
 889                                       | (opt_info
 890                                          ? DLTHE_RECORD_COPY_NUMBER
 891                                            : 0));
 892   gcc_assert (ok);
 893
 894   if (opt_info)
 895     {
 896       apply_opt_in_copies (opt_info, max_unroll, true, true);
 897       free_opt_info (opt_info);
 898     }
 899
 900   free (wont_exit);
 901
 902   if (exit_at_end)
 903     {
 904       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 905       /* Find a new in and out edge; they are in the last copy we have made.  */
 906
 907       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 908         {
 909           desc->out_edge = EDGE_SUCC (exit_block, 0);
 910           desc->in_edge = EDGE_SUCC (exit_block, 1);
 911         }
 912       else
 913         {
 914           desc->out_edge = EDGE_SUCC (exit_block, 1);
 915           desc->in_edge = EDGE_SUCC (exit_block, 0);
 916         }
 917     }
 918
 919   desc->niter /= max_unroll + 1;
 920   loop->nb_iterations_upper_bound
 921     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 922                                                                    + 1),
 923                                             TRUNC_DIV_EXPR);
 924   if (loop->any_estimate)
 925     loop->nb_iterations_estimate
 926       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 927                                                                   + 1),
 928                                            TRUNC_DIV_EXPR);
 929   desc->niter_expr = GEN_INT (desc->niter);
 930
 931   /* Remove the edges.  */
 932   FOR_EACH_VEC_ELT (remove_edges, i, e)
 933     remove_path (e);
 934   remove_edges.release ();
 935
 936   if (dump_file)
 937     fprintf (dump_file,
 938              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 939              max_unroll, num_loop_insns (loop));
 940 }
 941
 942 /* Decide whether to unroll LOOP iterating runtime computable number of times
 943    and how much.  */
 944 static void
 945 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 946 {
 947   unsigned nunroll, nunroll_by_av, i;
 948   struct niter_desc *desc;
 949   double_int iterations;
 950
 951   if (!(flags & UAP_UNROLL))
 952     {
 953       /* We were not asked to, just return back silently.  */
 954       return;
 955     }
 956
 957   if (dump_file)
 958     fprintf (dump_file,
 959              "\n;; Considering unrolling loop with runtime "
 960              "computable number of iterations\n");
 961
 962   /* nunroll = total number of copies of the original loop body in
 963      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 964   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 965   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 966   if (nunroll > nunroll_by_av)
 967     nunroll = nunroll_by_av;
 968   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 969     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 970
 971   if (targetm.loop_unroll_adjust)
 972     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 973
 974   /* Skip big loops.  */
 975   if (nunroll <= 1)
 976     {
 977       if (dump_file)
 978         fprintf (dump_file, ";; Not considering loop, is too big\n");
 979       return;
 980     }
 981
 982   /* Check for simple loops.  */
 983   desc = get_simple_loop_desc (loop);
 984
 985   /* Check simpleness.  */
 986   if (!desc->simple_p || desc->assumptions)
 987     {
 988       if (dump_file)
 989         fprintf (dump_file,
 990                  ";; Unable to prove that the number of iterations "
 991                  "can be counted in runtime\n");
 992       return;
 993     }
 994
 995   if (desc->const_iter)
 996     {
 997       if (dump_file)
 998         fprintf (dump_file, ";; Loop iterates constant times\n");
 999       return;
1000     }
1001
1002   /* Check whether the loop rolls.  */
1003   if ((get_estimated_loop_iterations (loop, &iterations)
1004        || get_max_loop_iterations (loop, &iterations))
1005       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1006     {
1007       if (dump_file)
1008         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1009       return;
1010     }
1011
1012   /* Success; now force nunroll to be power of 2, as we are unable to
1013      cope with overflows in computation of number of iterations.  */
1014   for (i = 1; 2 * i <= nunroll; i *= 2)
1015     continue;
1016
1017   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1018   loop->lpt_decision.times = i - 1;
1019 }
1020
1021 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1022    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1023    and NULL is returned instead.  */
1024
1025 basic_block
1026 split_edge_and_insert (edge e, rtx insns)
1027 {
1028   basic_block bb;
1029
1030   if (!insns)
1031     return NULL;
1032   bb = split_edge (e);
1033   emit_insn_after (insns, BB_END (bb));
1034
1035   /* ??? We used to assume that INSNS can contain control flow insns, and
1036      that we had to try to find sub basic blocks in BB to maintain a valid
1037      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1038      and call break_superblocks when going out of cfglayout mode.  But it
1039      turns out that this never happens; and that if it does ever happen,
1040      the TODO_verify_flow at the end of the RTL loop passes would fail.
1041
1042      There are two reasons why we expected we could have control flow insns
1043      in INSNS.  The first is when a comparison has to be done in parts, and
1044      the second is when the number of iterations is computed for loops with
1045      the number of iterations known at runtime.  In both cases, test cases
1046      to get control flow in INSNS appear to be impossible to construct:
1047
1048       * If do_compare_rtx_and_jump needs several branches to do comparison
1049         in a mode that needs comparison by parts, we cannot analyze the
1050         number of iterations of the loop, and we never get to unrolling it.
1051
1052       * The code in expand_divmod that was suspected to cause creation of
1053         branching code seems to be only accessed for signed division.  The
1054         divisions used by # of iterations analysis are always unsigned.
1055         Problems might arise on architectures that emits branching code
1056         for some operations that may appear in the unroller (especially
1057         for division), but we have no such architectures.
1058
1059      Considering all this, it was decided that we should for now assume
1060      that INSNS can in theory contain control flow insns, but in practice
1061      it never does.  So we don't handle the theoretical case, and should
1062      a real failure ever show up, we have a pretty good clue for how to
1063      fix it.  */
1064
1065   return bb;
1066 }
1067
1068 /* Unroll LOOP for which we are able to count number of iterations in runtime
1069    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1070    extra care for case n < 0):
1071
1072    for (i = 0; i < n; i++)
1073      body;
1074
1075    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1076
1077    i = 0;
1078    mod = n % 4;
1079
1080    switch (mod)
1081      {
1082        case 3:
1083          body; i++;
1084        case 2:
1085          body; i++;
1086        case 1:
1087          body; i++;
1088        case 0: ;
1089      }
1090
1091    while (i < n)
1092      {
1093        body; i++;
1094        body; i++;
1095        body; i++;
1096        body; i++;
1097      }
1098    */
1099 static void
1100 unroll_loop_runtime_iterations (struct loop *loop)
1101 {
1102   rtx old_niter, niter, init_code, branch_code, tmp;
1103   unsigned i, j, p;
1104   basic_block preheader, *body, swtch, ezc_swtch;
1105   vec<basic_block> dom_bbs;
1106   sbitmap wont_exit;
1107   int may_exit_copy;
1108   unsigned n_peel;
1109   vec<edge> remove_edges;
1110   edge e;
1111   bool extra_zero_check, last_may_exit;
1112   unsigned max_unroll = loop->lpt_decision.times;
1113   struct niter_desc *desc = get_simple_loop_desc (loop);
1114   bool exit_at_end = loop_exit_at_end_p (loop);
1115   struct opt_info *opt_info = NULL;
1116   bool ok;
1117
1118   if (flag_split_ivs_in_unroller
1119       || flag_variable_expansion_in_unroller)
1120     opt_info = analyze_insns_in_loop (loop);
1121
1122   /* Remember blocks whose dominators will have to be updated.  */
1123   dom_bbs.create (0);
1124
1125   body = get_loop_body (loop);
1126   for (i = 0; i < loop->num_nodes; i++)
1127     {
1128       vec<basic_block> ldom;
1129       basic_block bb;
1130
1131       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1132       FOR_EACH_VEC_ELT (ldom, j, bb)
1133         if (!flow_bb_inside_loop_p (loop, bb))
1134           dom_bbs.safe_push (bb);
1135
1136       ldom.release ();
1137     }
1138   free (body);
1139
1140   if (!exit_at_end)
1141     {
1142       /* Leave exit in first copy (for explanation why see comment in
1143          unroll_loop_constant_iterations).  */
1144       may_exit_copy = 0;
1145       n_peel = max_unroll - 1;
1146       extra_zero_check = true;
1147       last_may_exit = false;
1148     }
1149   else
1150     {
1151       /* Leave exit in last copy (for explanation why see comment in
1152          unroll_loop_constant_iterations).  */
1153       may_exit_copy = max_unroll;
1154       n_peel = max_unroll;
1155       extra_zero_check = false;
1156       last_may_exit = true;
1157     }
1158
1159   /* Get expression for number of iterations.  */
1160   start_sequence ();
1161   old_niter = niter = gen_reg_rtx (desc->mode);
1162   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1163   if (tmp != niter)
1164     emit_move_insn (niter, tmp);
1165
1166   /* Count modulo by ANDing it with max_unroll; we use the fact that
1167      the number of unrollings is a power of two, and thus this is correct
1168      even if there is overflow in the computation.  */
1169   niter = expand_simple_binop (desc->mode, AND,
1170                                niter, gen_int_mode (max_unroll, desc->mode),
1171                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1172
1173   init_code = get_insns ();
1174   end_sequence ();
1175   unshare_all_rtl_in_chain (init_code);
1176
1177   /* Precondition the loop.  */
1178   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1179
1180   remove_edges.create (0);
1181
1182   wont_exit = sbitmap_alloc (max_unroll + 2);
1183
1184   /* Peel the first copy of loop body (almost always we must leave exit test
1185      here; the only exception is when we have extra zero check and the number
1186      of iterations is reliable.  Also record the place of (possible) extra
1187      zero check.  */
1188   bitmap_clear (wont_exit);
1189   if (extra_zero_check
1190       && !desc->noloop_assumptions)
1191     bitmap_set_bit (wont_exit, 1);
1192   ezc_swtch = loop_preheader_edge (loop)->src;
1193   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1194                                       1, wont_exit, desc->out_edge,
1195                                       &remove_edges,
1196                                       DLTHE_FLAG_UPDATE_FREQ);
1197   gcc_assert (ok);
1198
1199   /* Record the place where switch will be built for preconditioning.  */
1200   swtch = split_edge (loop_preheader_edge (loop));
1201
1202   for (i = 0; i < n_peel; i++)
1203     {
1204       /* Peel the copy.  */
1205       bitmap_clear (wont_exit);
1206       if (i != n_peel - 1 || !last_may_exit)
1207         bitmap_set_bit (wont_exit, 1);
1208       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1209                                           1, wont_exit, desc->out_edge,
1210                                           &remove_edges,
1211                                           DLTHE_FLAG_UPDATE_FREQ);
1212       gcc_assert (ok);
1213
1214       /* Create item for switch.  */
1215       j = n_peel - i - (extra_zero_check ? 0 : 1);
1216       p = REG_BR_PROB_BASE / (i + 2);
1217
1218       preheader = split_edge (loop_preheader_edge (loop));
1219       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1220                                           block_label (preheader), p,
1221                                           NULL_RTX);
1222
1223       /* We rely on the fact that the compare and jump cannot be optimized out,
1224          and hence the cfg we create is correct.  */
1225       gcc_assert (branch_code != NULL_RTX);
1226
1227       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1228       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1229       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1230       e = make_edge (swtch, preheader,
1231                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1232       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1233       e->probability = p;
1234     }
1235
1236   if (extra_zero_check)
1237     {
1238       /* Add branch for zero iterations.  */
1239       p = REG_BR_PROB_BASE / (max_unroll + 1);
1240       swtch = ezc_swtch;
1241       preheader = split_edge (loop_preheader_edge (loop));
1242       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1243                                           block_label (preheader), p,
1244                                           NULL_RTX);
1245       gcc_assert (branch_code != NULL_RTX);
1246
1247       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1248       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1249       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1250       e = make_edge (swtch, preheader,
1251                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1252       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1253       e->probability = p;
1254     }
1255
1256   /* Recount dominators for outer blocks.  */
1257   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1258
1259   /* And unroll loop.  */
1260
1261   bitmap_ones (wont_exit);
1262   bitmap_clear_bit (wont_exit, may_exit_copy);
1263   opt_info_start_duplication (opt_info);
1264
1265   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1266                                       max_unroll,
1267                                       wont_exit, desc->out_edge,
1268                                       &remove_edges,
1269                                       DLTHE_FLAG_UPDATE_FREQ
1270                                       | (opt_info
1271                                          ? DLTHE_RECORD_COPY_NUMBER
1272                                            : 0));
1273   gcc_assert (ok);
1274
1275   if (opt_info)
1276     {
1277       apply_opt_in_copies (opt_info, max_unroll, true, true);
1278       free_opt_info (opt_info);
1279     }
1280
1281   free (wont_exit);
1282
1283   if (exit_at_end)
1284     {
1285       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1286       /* Find a new in and out edge; they are in the last copy we have
1287          made.  */
1288
1289       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1290         {
1291           desc->out_edge = EDGE_SUCC (exit_block, 0);
1292           desc->in_edge = EDGE_SUCC (exit_block, 1);
1293         }
1294       else
1295         {
1296           desc->out_edge = EDGE_SUCC (exit_block, 1);
1297           desc->in_edge = EDGE_SUCC (exit_block, 0);
1298         }
1299     }
1300
1301   /* Remove the edges.  */
1302   FOR_EACH_VEC_ELT (remove_edges, i, e)
1303     remove_path (e);
1304   remove_edges.release ();
1305
1306   /* We must be careful when updating the number of iterations due to
1307      preconditioning and the fact that the value must be valid at entry
1308      of the loop.  After passing through the above code, we see that
1309      the correct new number of iterations is this:  */
1310   gcc_assert (!desc->const_iter);
1311   desc->niter_expr =
1312     simplify_gen_binary (UDIV, desc->mode, old_niter,
1313                          gen_int_mode (max_unroll + 1, desc->mode));
1314   loop->nb_iterations_upper_bound
1315     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1316                                                                    + 1),
1317                                             TRUNC_DIV_EXPR);
1318   if (loop->any_estimate)
1319     loop->nb_iterations_estimate
1320       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1321                                                                   + 1),
1322                                            TRUNC_DIV_EXPR);
1323   if (exit_at_end)
1324     {
1325       desc->niter_expr =
1326         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1327       desc->noloop_assumptions = NULL_RTX;
1328       --loop->nb_iterations_upper_bound;
1329       if (loop->any_estimate
1330           && loop->nb_iterations_estimate != double_int_zero)
1331         --loop->nb_iterations_estimate;
1332       else
1333         loop->any_estimate = false;
1334     }
1335
1336   if (dump_file)
1337     fprintf (dump_file,
1338              ";; Unrolled loop %d times, counting # of iterations "
1339              "in runtime, %i insns\n",
1340              max_unroll, num_loop_insns (loop));
1341
1342   dom_bbs.release ();
1343 }
1344
1345 /* Decide whether to simply peel LOOP and how much.  */
1346 static void
1347 decide_peel_simple (struct loop *loop, int flags)
1348 {
1349   unsigned npeel;
1350   double_int iterations;
1351
1352   if (!(flags & UAP_PEEL))
1353     {
1354       /* We were not asked to, just return back silently.  */
1355       return;
1356     }
1357
1358   if (dump_file)
1359     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1360
1361   /* npeel = number of iterations to peel.  */
1362   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1363   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1364     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1365
1366   /* Skip big loops.  */
1367   if (!npeel)
1368     {
1369       if (dump_file)
1370         fprintf (dump_file, ";; Not considering loop, is too big\n");
1371       return;
1372     }
1373
1374   /* Do not simply peel loops with branches inside -- it increases number
1375      of mispredicts.
1376      Exception is when we do have profile and we however have good chance
1377      to peel proper number of iterations loop will iterate in practice.
1378      TODO: this heuristic needs tunning; while for complette unrolling
1379      the branch inside loop mostly eliminates any improvements, for
1380      peeling it is not the case.  Also a function call inside loop is
1381      also branch from branch prediction POV (and probably better reason
1382      to not unroll/peel).  */
1383   if (num_loop_branches (loop) > 1
1384       && profile_status != PROFILE_READ)
1385     {
1386       if (dump_file)
1387         fprintf (dump_file, ";; Not peeling, contains branches\n");
1388       return;
1389     }
1390
1391   /* If we have realistic estimate on number of iterations, use it.  */
1392   if (get_estimated_loop_iterations (loop, &iterations))
1393     {
1394       if (double_int::from_shwi (npeel).ule (iterations))
1395         {
1396           if (dump_file)
1397             {
1398               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1399               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1400                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1401               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1402                        npeel);
1403             }
1404           return;
1405         }
1406       npeel = iterations.to_shwi () + 1;
1407     }
1408   /* If we have small enough bound on iterations, we can still peel (completely
1409      unroll).  */
1410   else if (get_max_loop_iterations (loop, &iterations)
1411            && iterations.ult (double_int::from_shwi (npeel)))
1412     npeel = iterations.to_shwi () + 1;
1413   else
1414     {
1415       /* For now we have no good heuristics to decide whether loop peeling
1416          will be effective, so disable it.  */
1417       if (dump_file)
1418         fprintf (dump_file,
1419                  ";; Not peeling loop, no evidence it will be profitable\n");
1420       return;
1421     }
1422
1423   /* Success.  */
1424   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1425   loop->lpt_decision.times = npeel;
1426 }
1427
1428 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1429
1430    while (cond)
1431      body;
1432
1433    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1434
1435    if (!cond) goto end;
1436    body;
1437    if (!cond) goto end;
1438    body;
1439    if (!cond) goto end;
1440    body;
1441    while (cond)
1442      body;
1443    end: ;
1444    */
1445 static void
1446 peel_loop_simple (struct loop *loop)
1447 {
1448   sbitmap wont_exit;
1449   unsigned npeel = loop->lpt_decision.times;
1450   struct niter_desc *desc = get_simple_loop_desc (loop);
1451   struct opt_info *opt_info = NULL;
1452   bool ok;
1453
1454   if (flag_split_ivs_in_unroller && npeel > 1)
1455     opt_info = analyze_insns_in_loop (loop);
1456
1457   wont_exit = sbitmap_alloc (npeel + 1);
1458   bitmap_clear (wont_exit);
1459
1460   opt_info_start_duplication (opt_info);
1461
1462   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1463                                       npeel, wont_exit, NULL,
1464                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1465                                       | (opt_info
1466                                          ? DLTHE_RECORD_COPY_NUMBER
1467                                            : 0));
1468   gcc_assert (ok);
1469
1470   free (wont_exit);
1471
1472   if (opt_info)
1473     {
1474       apply_opt_in_copies (opt_info, npeel, false, false);
1475       free_opt_info (opt_info);
1476     }
1477
1478   if (desc->simple_p)
1479     {
1480       if (desc->const_iter)
1481         {
1482           desc->niter -= npeel;
1483           desc->niter_expr = GEN_INT (desc->niter);
1484           desc->noloop_assumptions = NULL_RTX;
1485         }
1486       else
1487         {
1488           /* We cannot just update niter_expr, as its value might be clobbered
1489              inside loop.  We could handle this by counting the number into
1490              temporary just like we do in runtime unrolling, but it does not
1491              seem worthwhile.  */
1492           free_simple_loop_desc (loop);
1493         }
1494     }
1495   if (dump_file)
1496     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1497 }
1498
1499 /* Decide whether to unroll LOOP stupidly and how much.  */
1500 static void
1501 decide_unroll_stupid (struct loop *loop, int flags)
1502 {
1503   unsigned nunroll, nunroll_by_av, i;
1504   struct niter_desc *desc;
1505   double_int iterations;
1506
1507   if (!(flags & UAP_UNROLL_ALL))
1508     {
1509       /* We were not asked to, just return back silently.  */
1510       return;
1511     }
1512
1513   if (dump_file)
1514     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1515
1516   /* nunroll = total number of copies of the original loop body in
1517      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1518   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1519   nunroll_by_av
1520     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1521   if (nunroll > nunroll_by_av)
1522     nunroll = nunroll_by_av;
1523   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1524     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1525
1526   if (targetm.loop_unroll_adjust)
1527     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1528
1529   /* Skip big loops.  */
1530   if (nunroll <= 1)
1531     {
1532       if (dump_file)
1533         fprintf (dump_file, ";; Not considering loop, is too big\n");
1534       return;
1535     }
1536
1537   /* Check for simple loops.  */
1538   desc = get_simple_loop_desc (loop);
1539
1540   /* Check simpleness.  */
1541   if (desc->simple_p && !desc->assumptions)
1542     {
1543       if (dump_file)
1544         fprintf (dump_file, ";; The loop is simple\n");
1545       return;
1546     }
1547
1548   /* Do not unroll loops with branches inside -- it increases number
1549      of mispredicts.
1550      TODO: this heuristic needs tunning; call inside the loop body
1551      is also relatively good reason to not unroll.  */
1552   if (num_loop_branches (loop) > 1)
1553     {
1554       if (dump_file)
1555         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1556       return;
1557     }
1558
1559   /* Check whether the loop rolls.  */
1560   if ((get_estimated_loop_iterations (loop, &iterations)
1561        || get_max_loop_iterations (loop, &iterations))
1562       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1563     {
1564       if (dump_file)
1565         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1566       return;
1567     }
1568
1569   /* Success.  Now force nunroll to be power of 2, as it seems that this
1570      improves results (partially because of better alignments, partially
1571      because of some dark magic).  */
1572   for (i = 1; 2 * i <= nunroll; i *= 2)
1573     continue;
1574
1575   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1576   loop->lpt_decision.times = i - 1;
1577 }
1578
1579 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1580
1581    while (cond)
1582      body;
1583
1584    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1585
1586    while (cond)
1587      {
1588        body;
1589        if (!cond) break;
1590        body;
1591        if (!cond) break;
1592        body;
1593        if (!cond) break;
1594        body;
1595      }
1596    */
1597 static void
1598 unroll_loop_stupid (struct loop *loop)
1599 {
1600   sbitmap wont_exit;
1601   unsigned nunroll = loop->lpt_decision.times;
1602   struct niter_desc *desc = get_simple_loop_desc (loop);
1603   struct opt_info *opt_info = NULL;
1604   bool ok;
1605
1606   if (flag_split_ivs_in_unroller
1607       || flag_variable_expansion_in_unroller)
1608     opt_info = analyze_insns_in_loop (loop);
1609
1610
1611   wont_exit = sbitmap_alloc (nunroll + 1);
1612   bitmap_clear (wont_exit);
1613   opt_info_start_duplication (opt_info);
1614
1615   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1616                                       nunroll, wont_exit,
1617                                       NULL, NULL,
1618                                       DLTHE_FLAG_UPDATE_FREQ
1619                                       | (opt_info
1620                                          ? DLTHE_RECORD_COPY_NUMBER
1621                                            : 0));
1622   gcc_assert (ok);
1623
1624   if (opt_info)
1625     {
1626       apply_opt_in_copies (opt_info, nunroll, true, true);
1627       free_opt_info (opt_info);
1628     }
1629
1630   free (wont_exit);
1631
1632   if (desc->simple_p)
1633     {
1634       /* We indeed may get here provided that there are nontrivial assumptions
1635          for a loop to be really simple.  We could update the counts, but the
1636          problem is that we are unable to decide which exit will be taken
1637          (not really true in case the number of iterations is constant,
1638          but no one will do anything with this information, so we do not
1639          worry about it).  */
1640       desc->simple_p = false;
1641     }
1642
1643   if (dump_file)
1644     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1645              nunroll, num_loop_insns (loop));
1646 }
1647
1648 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1649    Set *DEBUG_USES to the number of debug insns that reference the
1650    variable.  */
1651
1652 bool
1653 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1654                                   int *debug_uses)
1655 {
1656   basic_block *body, bb;
1657   unsigned i;
1658   int count_ref = 0;
1659   rtx insn;
1660
1661   body = get_loop_body (loop);
1662   for (i = 0; i < loop->num_nodes; i++)
1663     {
1664       bb = body[i];
1665
1666       FOR_BB_INSNS (bb, insn)
1667         if (!rtx_referenced_p (reg, insn))
1668           continue;
1669         else if (DEBUG_INSN_P (insn))
1670           ++*debug_uses;
1671         else if (++count_ref > 1)
1672           break;
1673     }
1674   free (body);
1675   return (count_ref  == 1);
1676 }
1677
1678 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1679
1680 static void
1681 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1682 {
1683   basic_block *body, bb;
1684   unsigned i;
1685   rtx insn;
1686
1687   body = get_loop_body (loop);
1688   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1689     {
1690       bb = body[i];
1691
1692       FOR_BB_INSNS (bb, insn)
1693         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1694           continue;
1695         else
1696           {
1697             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1698                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1699             if (!--debug_uses)
1700               break;
1701           }
1702     }
1703   free (body);
1704 }
1705
1706 /* Determine whether INSN contains an accumulator
1707    which can be expanded into separate copies,
1708    one for each copy of the LOOP body.
1709
1710    for (i = 0 ; i < n; i++)
1711      sum += a[i];
1712
1713    ==>
1714
1715    sum += a[i]
1716    ....
1717    i = i+1;
1718    sum1 += a[i]
1719    ....
1720    i = i+1
1721    sum2 += a[i];
1722    ....
1723
1724    Return NULL if INSN contains no opportunity for expansion of accumulator.
1725    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1726    information and return a pointer to it.
1727 */
1728
1729 static struct var_to_expand *
1730 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1731 {
1732   rtx set, dest, src;
1733   struct var_to_expand *ves;
1734   unsigned accum_pos;
1735   enum rtx_code code;
1736   int debug_uses = 0;
1737
1738   set = single_set (insn);
1739   if (!set)
1740     return NULL;
1741
1742   dest = SET_DEST (set);
1743   src = SET_SRC (set);
1744   code = GET_CODE (src);
1745
1746   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1747     return NULL;
1748
1749   if (FLOAT_MODE_P (GET_MODE (dest)))
1750     {
1751       if (!flag_associative_math)
1752         return NULL;
1753       /* In the case of FMA, we're also changing the rounding.  */
1754       if (code == FMA && !flag_unsafe_math_optimizations)
1755         return NULL;
1756     }
1757
1758   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1759      in MD.  But if there is no optab to generate the insn, we can not
1760      perform the variable expansion.  This can happen if an MD provides
1761      an insn but not a named pattern to generate it, for example to avoid
1762      producing code that needs additional mode switches like for x87/mmx.
1763
1764      So we check have_insn_for which looks for an optab for the operation
1765      in SRC.  If it doesn't exist, we can't perform the expansion even
1766      though INSN is valid.  */
1767   if (!have_insn_for (code, GET_MODE (src)))
1768     return NULL;
1769
1770   if (!REG_P (dest)
1771       && !(GET_CODE (dest) == SUBREG
1772            && REG_P (SUBREG_REG (dest))))
1773     return NULL;
1774
1775   /* Find the accumulator use within the operation.  */
1776   if (code == FMA)
1777     {
1778       /* We only support accumulation via FMA in the ADD position.  */
1779       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1780         return NULL;
1781       accum_pos = 2;
1782     }
1783   else if (rtx_equal_p (dest, XEXP (src, 0)))
1784     accum_pos = 0;
1785   else if (rtx_equal_p (dest, XEXP (src, 1)))
1786     {
1787       /* The method of expansion that we are using; which includes the
1788          initialization of the expansions with zero and the summation of
1789          the expansions at the end of the computation will yield wrong
1790          results for (x = something - x) thus avoid using it in that case.  */
1791       if (code == MINUS)
1792         return NULL;
1793       accum_pos = 1;
1794     }
1795   else
1796     return NULL;
1797
1798   /* It must not otherwise be used.  */
1799   if (code == FMA)
1800     {
1801       if (rtx_referenced_p (dest, XEXP (src, 0))
1802           || rtx_referenced_p (dest, XEXP (src, 1)))
1803         return NULL;
1804     }
1805   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1806     return NULL;
1807
1808   /* It must be used in exactly one insn.  */
1809   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1810     return NULL;
1811
1812   if (dump_file)
1813     {
1814       fprintf (dump_file, "\n;; Expanding Accumulator ");
1815       print_rtl (dump_file, dest);
1816       fprintf (dump_file, "\n");
1817     }
1818
1819   if (debug_uses)
1820     /* Instead of resetting the debug insns, we could replace each
1821        debug use in the loop with the sum or product of all expanded
1822        accummulators.  Since we'll only know of all expansions at the
1823        end, we'd have to keep track of which vars_to_expand a debug
1824        insn in the loop references, take note of each copy of the
1825        debug insn during unrolling, and when it's all done, compute
1826        the sum or product of each variable and adjust the original
1827        debug insn and each copy thereof.  What a pain!  */
1828     reset_debug_uses_in_loop (loop, dest, debug_uses);
1829
1830   /* Record the accumulator to expand.  */
1831   ves = XNEW (struct var_to_expand);
1832   ves->insn = insn;
1833   ves->reg = copy_rtx (dest);
1834   ves->var_expansions.create (1);
1835   ves->next = NULL;
1836   ves->op = GET_CODE (src);
1837   ves->expansion_count = 0;
1838   ves->reuse_expansion = 0;
1839   return ves;
1840 }
1841
1842 /* Determine whether there is an induction variable in INSN that
1843    we would like to split during unrolling.
1844
1845    I.e. replace
1846
1847    i = i + 1;
1848    ...
1849    i = i + 1;
1850    ...
1851    i = i + 1;
1852    ...
1853
1854    type chains by
1855
1856    i0 = i + 1
1857    ...
1858    i = i0 + 1
1859    ...
1860    i = i0 + 2
1861    ...
1862
1863    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1864    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1865    pointer to it.  */
1866
1867 static struct iv_to_split *
1868 analyze_iv_to_split_insn (rtx insn)
1869 {
1870   rtx set, dest;
1871   struct rtx_iv iv;
1872   struct iv_to_split *ivts;
1873   bool ok;
1874
1875   /* For now we just split the basic induction variables.  Later this may be
1876      extended for example by selecting also addresses of memory references.  */
1877   set = single_set (insn);
1878   if (!set)
1879     return NULL;
1880
1881   dest = SET_DEST (set);
1882   if (!REG_P (dest))
1883     return NULL;
1884
1885   if (!biv_p (insn, dest))
1886     return NULL;
1887
1888   ok = iv_analyze_result (insn, dest, &iv);
1889
1890   /* This used to be an assert under the assumption that if biv_p returns
1891      true that iv_analyze_result must also return true.  However, that
1892      assumption is not strictly correct as evidenced by pr25569.
1893
1894      Returning NULL when iv_analyze_result returns false is safe and
1895      avoids the problems in pr25569 until the iv_analyze_* routines
1896      can be fixed, which is apparently hard and time consuming
1897      according to their author.  */
1898   if (! ok)
1899     return NULL;
1900
1901   if (iv.step == const0_rtx
1902       || iv.mode != iv.extend_mode)
1903     return NULL;
1904
1905   /* Record the insn to split.  */
1906   ivts = XNEW (struct iv_to_split);
1907   ivts->insn = insn;
1908   ivts->orig_var = dest;
1909   ivts->base_var = NULL_RTX;
1910   ivts->step = iv.step;
1911   ivts->next = NULL;
1912   ivts->n_loc = 1;
1913   ivts->loc[0] = 1;
1914
1915   return ivts;
1916 }
1917
1918 /* Determines which of insns in LOOP can be optimized.
1919    Return a OPT_INFO struct with the relevant hash tables filled
1920    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1921    is undefined for the return value.  */
1922
1923 static struct opt_info *
1924 analyze_insns_in_loop (struct loop *loop)
1925 {
1926   basic_block *body, bb;
1927   unsigned i;
1928   struct opt_info *opt_info = XCNEW (struct opt_info);
1929   rtx insn;
1930   struct iv_to_split *ivts = NULL;
1931   struct var_to_expand *ves = NULL;
1932   iv_to_split **slot1;
1933   var_to_expand **slot2;
1934   vec<edge> edges = get_loop_exit_edges (loop);
1935   edge exit;
1936   bool can_apply = false;
1937
1938   iv_analysis_loop_init (loop);
1939
1940   body = get_loop_body (loop);
1941
1942   if (flag_split_ivs_in_unroller)
1943     {
1944       opt_info->insns_to_split.create (5 * loop->num_nodes);
1945       opt_info->iv_to_split_head = NULL;
1946       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1947     }
1948
1949   /* Record the loop exit bb and loop preheader before the unrolling.  */
1950   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1951
1952   if (edges.length () == 1)
1953     {
1954       exit = edges[0];
1955       if (!(exit->flags & EDGE_COMPLEX))
1956         {
1957           opt_info->loop_exit = split_edge (exit);
1958           can_apply = true;
1959         }
1960     }
1961
1962   if (flag_variable_expansion_in_unroller
1963       && can_apply)
1964     {
1965       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1966       opt_info->var_to_expand_head = NULL;
1967       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1968     }
1969
1970   for (i = 0; i < loop->num_nodes; i++)
1971     {
1972       bb = body[i];
1973       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1974         continue;
1975
1976       FOR_BB_INSNS (bb, insn)
1977       {
1978         if (!INSN_P (insn))
1979           continue;
1980
1981         if (opt_info->insns_to_split.is_created ())
1982           ivts = analyze_iv_to_split_insn (insn);
1983
1984         if (ivts)
1985           {
1986             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
1987             gcc_assert (*slot1 == NULL);
1988             *slot1 = ivts;
1989             *opt_info->iv_to_split_tail = ivts;
1990             opt_info->iv_to_split_tail = &ivts->next;
1991             continue;
1992           }
1993
1994         if (opt_info->insns_with_var_to_expand.is_created ())
1995           ves = analyze_insn_to_expand_var (loop, insn);
1996
1997         if (ves)
1998           {
1999             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
2000             gcc_assert (*slot2 == NULL);
2001             *slot2 = ves;
2002             *opt_info->var_to_expand_tail = ves;
2003             opt_info->var_to_expand_tail = &ves->next;
2004           }
2005       }
2006     }
2007
2008   edges.release ();
2009   free (body);
2010   return opt_info;
2011 }
2012
2013 /* Called just before loop duplication.  Records start of duplicated area
2014    to OPT_INFO.  */
2015
2016 static void
2017 opt_info_start_duplication (struct opt_info *opt_info)
2018 {
2019   if (opt_info)
2020     opt_info->first_new_block = last_basic_block;
2021 }
2022
/* Return the number of iterations that separate the initialization of
   the base variable from the copy numbered N_COPY.  N_COPIES is the
   total number of newly created copies.  UNROLLING is true when the
   loop is being unrolled rather than peeled.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization lives in the original loop body
     (number 0), so the distance is simply the copy number.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization occurs in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy ? n_copy - 1 : n_copies;
}
2047
2048 /* Locate in EXPR the expression corresponding to the location recorded
2049    in IVTS, and return a pointer to the RTX for this location.  */
2050
2051 static rtx *
2052 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2053 {
2054   unsigned i;
2055   rtx *ret = &expr;
2056
2057   for (i = 0; i < ivts->n_loc; i++)
2058     ret = &XEXP (*ret, ivts->loc[i]);
2059
2060   return ret;
2061 }
2062
2063 /* Allocate basic variable for the induction variable chain.  */
2064
2065 static void
2066 allocate_basic_variable (struct iv_to_split *ivts)
2067 {
2068   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2069
2070   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2071 }
2072
2073 /* Insert initialization of basic variable of IVTS before INSN, taking
2074    the initial value from INSN.  */
2075
2076 static void
2077 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2078 {
2079   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2080   rtx seq;
2081
2082   start_sequence ();
2083   expr = force_operand (expr, ivts->base_var);
2084   if (expr != ivts->base_var)
2085     emit_move_insn (ivts->base_var, expr);
2086   seq = get_insns ();
2087   end_sequence ();
2088
2089   emit_insn_before (seq, insn);
2090 }
2091
2092 /* Replace the use of induction variable described in IVTS in INSN
2093    by base variable + DELTA * step.  */
2094
2095 static void
2096 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2097 {
2098   rtx expr, *loc, seq, incr, var;
2099   enum machine_mode mode = GET_MODE (ivts->base_var);
2100   rtx src, dest, set;
2101
2102   /* Construct base + DELTA * step.  */
2103   if (!delta)
2104     expr = ivts->base_var;
2105   else
2106     {
2107       incr = simplify_gen_binary (MULT, mode,
2108                                   ivts->step, gen_int_mode (delta, mode));
2109       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2110                                   ivts->base_var, incr);
2111     }
2112
2113   /* Figure out where to do the replacement.  */
2114   loc = get_ivts_expr (single_set (insn), ivts);
2115
2116   /* If we can make the replacement right away, we're done.  */
2117   if (validate_change (insn, loc, expr, 0))
2118     return;
2119
2120   /* Otherwise, force EXPR into a register and try again.  */
2121   start_sequence ();
2122   var = gen_reg_rtx (mode);
2123   expr = force_operand (expr, var);
2124   if (expr != var)
2125     emit_move_insn (var, expr);
2126   seq = get_insns ();
2127   end_sequence ();
2128   emit_insn_before (seq, insn);
2129
2130   if (validate_change (insn, loc, var, 0))
2131     return;
2132
2133   /* The last chance.  Try recreating the assignment in insn
2134      completely from scratch.  */
2135   set = single_set (insn);
2136   gcc_assert (set);
2137
2138   start_sequence ();
2139   *loc = var;
2140   src = copy_rtx (SET_SRC (set));
2141   dest = copy_rtx (SET_DEST (set));
2142   src = force_operand (src, dest);
2143   if (src != dest)
2144     emit_move_insn (dest, src);
2145   seq = get_insns ();
2146   end_sequence ();
2147
2148   emit_insn_before (seq, insn);
2149   delete_insn (insn);
2150 }
2151
2152
2153 /* Return one expansion of the accumulator recorded in struct VE.  */
2154
2155 static rtx
2156 get_expansion (struct var_to_expand *ve)
2157 {
2158   rtx reg;
2159
2160   if (ve->reuse_expansion == 0)
2161     reg = ve->reg;
2162   else
2163     reg = ve->var_expansions[ve->reuse_expansion - 1];
2164
2165   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2166     ve->reuse_expansion = 0;
2167   else
2168     ve->reuse_expansion++;
2169
2170   return reg;
2171 }
2172
2173
2174 /* Given INSN replace the uses of the accumulator recorded in VE
2175    with a new register.  */
2176
2177 static void
2178 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2179 {
2180   rtx new_reg, set;
2181   bool really_new_expansion = false;
2182
2183   set = single_set (insn);
2184   gcc_assert (set);
2185
2186   /* Generate a new register only if the expansion limit has not been
2187      reached.  Else reuse an already existing expansion.  */
2188   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2189     {
2190       really_new_expansion = true;
2191       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2192     }
2193   else
2194     new_reg = get_expansion (ve);
2195
2196   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2197   if (apply_change_group ())
2198     if (really_new_expansion)
2199       {
2200         ve->var_expansions.safe_push (new_reg);
2201         ve->expansion_count++;
2202       }
2203 }
2204
2205 /* Initialize the variable expansions in loop preheader.  PLACE is the
2206    loop-preheader basic block where the initialization of the
2207    expansions should take place.  The expansions are initialized with
2208    (-0) when the operation is plus or minus to honor sign zero.  This
2209    way we can prevent cases where the sign of the final result is
2210    effected by the sign of the expansion.  Here is an example to
2211    demonstrate this:
2212
2213    for (i = 0 ; i < n; i++)
2214      sum += something;
2215
2216    ==>
2217
2218    sum += something
2219    ....
2220    i = i+1;
2221    sum1 += something
2222    ....
2223    i = i+1
2224    sum2 += something;
2225    ....
2226
2227    When SUM is initialized with -zero and SOMETHING is also -zero; the
2228    final result of sum should be -zero thus the expansions sum1 and sum2
2229    should be initialized with -zero as well (otherwise we will get +zero
2230    as the final result).  */
2231
2232 static void
2233 insert_var_expansion_initialization (struct var_to_expand *ve,
2234                                      basic_block place)
2235 {
2236   rtx seq, var, zero_init;
2237   unsigned i;
2238   enum machine_mode mode = GET_MODE (ve->reg);
2239   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2240
2241   if (ve->var_expansions.length () == 0)
2242     return;
2243
2244   start_sequence ();
2245   switch (ve->op)
2246     {
2247     case FMA:
2248       /* Note that we only accumulate FMA via the ADD operand.  */
2249     case PLUS:
2250     case MINUS:
2251       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2252         {
2253           if (honor_signed_zero_p)
2254             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2255           else
2256             zero_init = CONST0_RTX (mode);
2257           emit_move_insn (var, zero_init);
2258         }
2259       break;
2260
2261     case MULT:
2262       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2263         {
2264           zero_init = CONST1_RTX (GET_MODE (var));
2265           emit_move_insn (var, zero_init);
2266         }
2267       break;
2268
2269     default:
2270       gcc_unreachable ();
2271     }
2272
2273   seq = get_insns ();
2274   end_sequence ();
2275
2276   emit_insn_after (seq, BB_END (place));
2277 }
2278
2279 /* Combine the variable expansions at the loop exit.  PLACE is the
2280    loop exit basic block where the summation of the expansions should
2281    take place.  */
2282
2283 static void
2284 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2285 {
2286   rtx sum = ve->reg;
2287   rtx expr, seq, var, insn;
2288   unsigned i;
2289
2290   if (ve->var_expansions.length () == 0)
2291     return;
2292
2293   start_sequence ();
2294   switch (ve->op)
2295     {
2296     case FMA:
2297       /* Note that we only accumulate FMA via the ADD operand.  */
2298     case PLUS:
2299     case MINUS:
2300       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2301         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2302       break;
2303
2304     case MULT:
2305       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2306         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2307       break;
2308
2309     default:
2310       gcc_unreachable ();
2311     }
2312
2313   expr = force_operand (sum, ve->reg);
2314   if (expr != ve->reg)
2315     emit_move_insn (ve->reg, expr);
2316   seq = get_insns ();
2317   end_sequence ();
2318
2319   insn = BB_HEAD (place);
2320   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2321     insn = NEXT_INSN (insn);
2322
2323   emit_insn_after (seq, insn);
2324 }
2325
2326 /* Strip away REG_EQUAL notes for IVs we're splitting.
2327
2328    Updating REG_EQUAL notes for IVs we split is tricky: We
2329    cannot tell until after unrolling, DF-rescanning, and liveness
2330    updating, whether an EQ_USE is reached by the split IV while
2331    the IV reg is still live.  See PR55006.
2332
2333    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2334    because RTL loop-iv requires us to defer rescanning insns and
2335    any notes attached to them.  So resort to old techniques...  */
2336
2337 static void
2338 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2339 {
2340   struct iv_to_split *ivts;
2341   rtx note = find_reg_equal_equiv_note (insn);
2342   if (! note)
2343     return;
2344   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2345     if (reg_mentioned_p (ivts->orig_var, note))
2346       {
2347         remove_note (insn, note);
2348         return;
2349       }
2350 }
2351
2352 /* Apply loop optimizations in loop copies using the
2353    data which gathered during the unrolling.  Structure
2354    OPT_INFO record that data.
2355
2356    UNROLLING is true if we unrolled (not peeled) the loop.
2357    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2358    the loop (as it should happen in complete unrolling, but not in ordinary
2359    peeling of the loop).  */
2360
2361 static void
2362 apply_opt_in_copies (struct opt_info *opt_info,
2363                      unsigned n_copies, bool unrolling,
2364                      bool rewrite_original_loop)
2365 {
2366   unsigned i, delta;
2367   basic_block bb, orig_bb;
2368   rtx insn, orig_insn, next;
2369   struct iv_to_split ivts_templ, *ivts;
2370   struct var_to_expand ve_templ, *ves;
2371
2372   /* Sanity check -- we need to put initialization in the original loop
2373      body.  */
2374   gcc_assert (!unrolling || rewrite_original_loop);
2375
2376   /* Allocate the basic variables (i0).  */
2377   if (opt_info->insns_to_split.is_created ())
2378     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2379       allocate_basic_variable (ivts);
2380
2381   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2382     {
2383       bb = BASIC_BLOCK (i);
2384       orig_bb = get_bb_original (bb);
2385
2386       /* bb->aux holds position in copy sequence initialized by
2387          duplicate_loop_to_header_edge.  */
2388       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2389                                         unrolling);
2390       bb->aux = 0;
2391       orig_insn = BB_HEAD (orig_bb);
2392       FOR_BB_INSNS_SAFE (bb, insn, next)
2393         {
2394           if (!INSN_P (insn)
2395               || (DEBUG_INSN_P (insn)
2396                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2397             continue;
2398
2399           while (!INSN_P (orig_insn)
2400                  || (DEBUG_INSN_P (orig_insn)
2401                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2402                          == LABEL_DECL)))
2403             orig_insn = NEXT_INSN (orig_insn);
2404
2405           ivts_templ.insn = orig_insn;
2406           ve_templ.insn = orig_insn;
2407
2408           /* Apply splitting iv optimization.  */
2409           if (opt_info->insns_to_split.is_created ())
2410             {
2411               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2412
2413               ivts = opt_info->insns_to_split.find (&ivts_templ);
2414
2415               if (ivts)
2416                 {
2417                   gcc_assert (GET_CODE (PATTERN (insn))
2418                               == GET_CODE (PATTERN (orig_insn)));
2419
2420                   if (!delta)
2421                     insert_base_initialization (ivts, insn);
2422                   split_iv (ivts, insn, delta);
2423                 }
2424             }
2425           /* Apply variable expansion optimization.  */
2426           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2427             {
2428               ves = (struct var_to_expand *)
2429                 opt_info->insns_with_var_to_expand.find (&ve_templ);
2430               if (ves)
2431                 {
2432                   gcc_assert (GET_CODE (PATTERN (insn))
2433                               == GET_CODE (PATTERN (orig_insn)));
2434                   expand_var_during_unrolling (ves, insn);
2435                 }
2436             }
2437           orig_insn = NEXT_INSN (orig_insn);
2438         }
2439     }
2440
2441   if (!rewrite_original_loop)
2442     return;
2443
2444   /* Initialize the variable expansions in the loop preheader
2445      and take care of combining them at the loop exit.  */
2446   if (opt_info->insns_with_var_to_expand.is_created ())
2447     {
2448       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2449         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2450       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2451         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2452     }
2453
2454   /* Rewrite also the original loop body.  Find them as originals of the blocks
2455      in the last copied iteration, i.e. those that have
2456      get_bb_copy (get_bb_original (bb)) == bb.  */
2457   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2458     {
2459       bb = BASIC_BLOCK (i);
2460       orig_bb = get_bb_original (bb);
2461       if (get_bb_copy (orig_bb) != bb)
2462         continue;
2463
2464       delta = determine_split_iv_delta (0, n_copies, unrolling);
2465       for (orig_insn = BB_HEAD (orig_bb);
2466            orig_insn != NEXT_INSN (BB_END (bb));
2467            orig_insn = next)
2468         {
2469           next = NEXT_INSN (orig_insn);
2470
2471           if (!INSN_P (orig_insn))
2472             continue;
2473
2474           ivts_templ.insn = orig_insn;
2475           if (opt_info->insns_to_split.is_created ())
2476             {
2477               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2478
2479               ivts = (struct iv_to_split *)
2480                 opt_info->insns_to_split.find (&ivts_templ);
2481               if (ivts)
2482                 {
2483                   if (!delta)
2484                     insert_base_initialization (ivts, orig_insn);
2485                   split_iv (ivts, orig_insn, delta);
2486                   continue;
2487                 }
2488             }
2489
2490         }
2491     }
2492 }
2493
2494 /* Release OPT_INFO.  */
2495
2496 static void
2497 free_opt_info (struct opt_info *opt_info)
2498 {
2499   if (opt_info->insns_to_split.is_created ())
2500     opt_info->insns_to_split.dispose ();
2501   if (opt_info->insns_with_var_to_expand.is_created ())
2502     {
2503       struct var_to_expand *ves;
2504
2505       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2506         ves->var_expansions.release ();
2507       opt_info->insns_with_var_to_expand.dispose ();
2508     }
2509   free (opt_info);
2510 }