gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2014 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "tree.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hash-table.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
  42    by fewer testings of exit conditions.  The main problem is code growth,
  43    that impacts performance negatively due to effect of caches.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
  64    There are a lot of parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
  66
  67    ??? A great problem is that we don't have a good way to determine
  68    how many times we should unroll the loop; the experiments I have made
  69    showed that this choice may affect performance on the order of several %.
  70    */
  71
  72 /* Information about an induction variable to split; entries are chained through NEXT.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx insn;             /* The insn in which the induction variable occurs.  */
  77   rtx orig_var;         /* The variable (register) for the IV before split.  */
  78   rtx base_var;         /* The variable on which the values in the further
  79                            iterations are based.  */
  80   rtx step;             /* Step of the induction variable.  */
  81   struct iv_to_split *next; /* Next entry in walking order.  */
  82   unsigned n_loc;       /* Number of entries of LOC that are in use.  */
  83   unsigned loc[3];      /* Location where the definition of the induction
  84                            variable occurs in the insn.  For example if
  85                            N_LOC is 2, the expression is located at
  86                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  87 };
  88
  89 /* Information about an accumulator to expand; entries are chained through NEXT.  */
  90
  91 struct var_to_expand
  92 {
  93   rtx insn;                        /* The insn in which the variable expansion occurs.  */
  94   rtx reg;                         /* The accumulator which is expanded.  */
  95   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  96   struct var_to_expand *next;      /* Next entry in walking order.  */
  97   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  98                                       or multiplication.  */
  99   int expansion_count;             /* The number of expansions generated so far.  */
 100   int reuse_expansion;             /* The expansion we intend to reuse to expand
 101                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 102                                       the original accumulator.  Else use
 103                                       var_expansions[REUSE_EXPANSION - 1].  */
 104 };
 105
 106 /* Hashtable helper for iv_to_split; stored and looked-up values are both iv_to_split.  */
 107
 108 struct iv_split_hasher : typed_free_remove <iv_to_split>
 109 {
 110   typedef iv_to_split value_type;
 111   typedef iv_to_split compare_type;
 112   static inline hashval_t hash (const value_type *);
 113   static inline bool equal (const value_type *, const compare_type *);
 114 };
 115
 116
 117 /* Return a hash for IVTS, computed from the UID of the insn it describes.  */
 118
 119 inline hashval_t
 120 iv_split_hasher::hash (const value_type *ivts)
 121 {
 122   return (hashval_t) INSN_UID (ivts->insn);
 123 }
 124
 125 /* Return true if I1 and I2 describe the same insn (their INSN fields are the same rtx).  */
 126
 127 inline bool
 128 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
 129 {
 130   return i1->insn == i2->insn;
 131 }
 132
 133 /* Hashtable helper for var_to_expand; stored and looked-up values are both var_to_expand.  */
 134
 135 struct var_expand_hasher : typed_free_remove <var_to_expand>
 136 {
 137   typedef var_to_expand value_type;
 138   typedef var_to_expand compare_type;
 139   static inline hashval_t hash (const value_type *);
 140   static inline bool equal (const value_type *, const compare_type *);
 141 };
 142
 143 /* Return a hash for VES, computed from the UID of the insn it describes.  */
 144
 145 inline hashval_t
 146 var_expand_hasher::hash (const value_type *ves)
 147 {
 148   return (hashval_t) INSN_UID (ves->insn);
 149 }
 150
 151 /* Return true if I1 and I2 refer to the same instruction (their INSN fields are the same rtx).  */
 152
 153 inline bool
 154 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
 155 {
 156   return i1->insn == i2->insn;
 157 }
 158
 159 /* Information about the optimizations (splitting of induction variables
 160    and expansion of accumulators) applied in the unrolled loop.  */
 161
 162 struct opt_info
 163 {
 164   hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
 165                                                   split.  */
 166   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 167   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the iv_to_split list.  */
 168   hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
 169                                         insns with accumulators to expand.  */
 170   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 171   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the var_to_expand list.  */
 172   unsigned first_new_block;        /* The first basic block that was
 173                                       duplicated.  */
 174   basic_block loop_exit;           /* The loop exit basic block.  */
 175   basic_block loop_preheader;      /* The loop preheader basic block.  */
 176 };
 177 /* Forward declarations of the file-local (static) helpers of this pass.  */
 178 static void decide_unrolling_and_peeling (int);
 179 static void peel_loops_completely (int);
 180 static void decide_peel_simple (struct loop *, int);
 181 static void decide_peel_once_rolling (struct loop *, int);
 182 static void decide_peel_completely (struct loop *, int);
 183 static void decide_unroll_stupid (struct loop *, int);
 184 static void decide_unroll_constant_iterations (struct loop *, int);
 185 static void decide_unroll_runtime_iterations (struct loop *, int);
 186 static void peel_loop_simple (struct loop *);
 187 static void peel_loop_completely (struct loop *);
 188 static void unroll_loop_stupid (struct loop *);
 189 static void unroll_loop_constant_iterations (struct loop *);
 190 static void unroll_loop_runtime_iterations (struct loop *);
 191 static struct opt_info *analyze_insns_in_loop (struct loop *);
 192 static void opt_info_start_duplication (struct opt_info *);
 193 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 194 static void free_opt_info (struct opt_info *);
 195 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 196 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 197 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 198 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 199 static void insert_var_expansion_initialization (struct var_to_expand *,
 200                                                  basic_block);
 201 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 202                                              basic_block);
 203 static rtx get_expansion (struct var_to_expand *);
 204
 205 /* Emit a message summarizing the unroll or peel that will be performed
 206    for LOOP (as recorded in LOOP->lpt_decision), at the loop's location
 207    LOCUS.  Nothing is emitted for LPT_NONE or when dumping is disabled.  */
 208
 209 static void
 210 report_unroll_peel (struct loop *loop, location_t locus)
 211 {
 212   struct niter_desc *desc;
 213   int niters = 0;
 214   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 215
 216   if (loop->lpt_decision.decision == LPT_NONE)
 217     return;
 218
 219   if (!dump_enabled_p ())
 220     return;
 221
 222   /* In the special case where the loop never iterated, emit
 223      a different message so that we don't report an unroll by 0.
 224      This matches the equivalent message emitted during tree unrolling.  */
 225   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 226       && !loop->lpt_decision.times)
 227     {
 228       dump_printf_loc (report_flags, locus,
 229                        "loop turned into non-loop; it never loops.\n");
 230       return;
 231     }
 232
 233   desc = get_simple_loop_desc (loop);
 234   /* Use the constant count if known, else the expected one when the header has a count.  */
 235   if (desc->const_iter)
 236     niters = desc->niter;
 237   else if (loop->header->count)
 238     niters = expected_loop_iterations (loop);
 239   /* Main message: which transformation, and how many times.  */
 240   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 241     dump_printf_loc (report_flags, locus,
 242                      "loop with %d iterations completely unrolled",
 243                      loop->lpt_decision.times + 1);
 244   else
 245     dump_printf_loc (report_flags, locus,
 246                      "loop %s %d times",
 247                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 248                        ? "peeled" : "unrolled"),
 249                      loop->lpt_decision.times);
 250   if (profile_info)
 251     dump_printf (report_flags, " (header execution count %d",
 252                  (int)loop->header->count);
 253   /* A complete peel already states its iteration count in the message above.  */
 254   if (loop->lpt_decision.decision != LPT_PEEL_COMPLETELY)
 255     dump_printf (report_flags,
 256                  "%s%s iterations %d)",
 257                  profile_info ? ", " : " (",
 258                  desc->const_iter ? "const" : "average",
 259                  niters);
 260   else if (profile_info)
 261     dump_printf (report_flags, ")");
 262
 263   dump_printf (report_flags, "\n");
 264 }
 265
 266 /* Unroll and/or peel loops, depending on FLAGS: peel completely first, then decide on and apply the rest.  */
 267 void
 268 unroll_and_peel_loops (int flags)
 269 {
 270   struct loop *loop;
 271   bool changed = false;
 272
 273   /* First perform complete loop peeling (it is almost surely a win,
 274      and affects parameters for further decision a lot).  */
 275   peel_loops_completely (flags);
 276
 277   /* Now decide rest of unrolling and peeling.  */
 278   decide_unrolling_and_peeling (flags);
 279
 280   /* Scan the loops, inner ones first.  */
 281   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 282     {
 283       /* And perform the appropriate transformations.  */
 284       switch (loop->lpt_decision.decision)
 285         {
 286         case LPT_PEEL_COMPLETELY:
 287           /* Already done (by peel_loops_completely above).  */
 288           gcc_unreachable ();
 289         case LPT_PEEL_SIMPLE:
 290           peel_loop_simple (loop);
 291           changed = true;
 292           break;
 293         case LPT_UNROLL_CONSTANT:
 294           unroll_loop_constant_iterations (loop);
 295           changed = true;
 296           break;
 297         case LPT_UNROLL_RUNTIME:
 298           unroll_loop_runtime_iterations (loop);
 299           changed = true;
 300           break;
 301         case LPT_UNROLL_STUPID:
 302           unroll_loop_stupid (loop);
 303           changed = true;
 304           break;
 305         case LPT_NONE:
 306           break;
 307         default:
 308           gcc_unreachable ();
 309         }
 310     }
 311     /* Some loop was transformed: recompute dominators and fix up the loop structure.  */
 312     if (changed)
 313       {
 314         calculate_dominance_info (CDI_DOMINATORS);
 315         fix_loop_structure (NULL);
 316       }
 317
 318   iv_analysis_done ();
 319 }
 320
 321 /* Return true if the exit of LOOP is at the end of the loop body.  */
 322 /* That is: the in_edge of LOOP's description enters the latch, and the latch has no non-debug insns.  */
 323 static bool
 324 loop_exit_at_end_p (struct loop *loop)
 325 {
 326   struct niter_desc *desc = get_simple_loop_desc (loop);
 327   rtx insn;
 328   /* DESC->in_edge must lead directly to the latch.  */
 329   if (desc->in_edge->dest != loop->latch)
 330     return false;
 331
 332   /* Check that the latch is empty (debug insns are ignored).  */
 333   FOR_BB_INSNS (loop->latch, insn)
 334     {
 335       if (NONDEBUG_INSN_P (insn))
 336         return false;
 337     }
 338
 339   return true;
 340 }
 341
 342 /* Decide, for each loop, whether to peel it completely, and do so.  FLAGS is passed on to the decide_* functions.  */
 343 static void
 344 peel_loops_completely (int flags)
 345 {
 346   struct loop *loop;
 347   bool changed = false;
 348
 349   /* Scan the loops, the inner ones first.  */
 350   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 351     {
 352       loop->lpt_decision.decision = LPT_NONE;
 353       location_t locus = get_loop_location (loop);
 354
 355       if (dump_enabled_p ())
 356         dump_printf_loc (TDF_RTL, locus,
 357                          ";; *** Considering loop %d at BB %d for "
 358                          "complete peeling ***\n",
 359                          loop->num, loop->header->index);
 360       /* The insn count is consulted by both decide_* functions below.  */
 361       loop->ninsns = num_loop_insns (loop);
 362       /* Try the once-rolling case first; fall back to general complete peeling.  */
 363       decide_peel_once_rolling (loop, flags);
 364       if (loop->lpt_decision.decision == LPT_NONE)
 365         decide_peel_completely (loop, flags);
 366       /* Report and apply the decision immediately.  */
 367       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 368         {
 369           report_unroll_peel (loop, locus);
 370           peel_loop_completely (loop);
 371           changed = true;
 372         }
 373     }
 374     /* Some loop was peeled: recompute dominators and fix up the loop structure.  */
 375     if (changed)
 376       {
 377         calculate_dominance_info (CDI_DOMINATORS);
 378         fix_loop_structure (NULL);
 379       }
 380 }
 381
 382 /* Decide whether to unroll or peel each loop (depending on FLAGS) and how much; record it in lpt_decision.  */
 383 static void
 384 decide_unrolling_and_peeling (int flags)
 385 {
 386   struct loop *loop;
 387
 388   /* Scan the loops, inner ones first.  */
 389   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 390     {
 391       loop->lpt_decision.decision = LPT_NONE;
 392       location_t locus = get_loop_location (loop);
 393
 394       if (dump_enabled_p ())
 395         dump_printf_loc (TDF_RTL, locus,
 396                          ";; *** Considering loop %d at BB %d for "
 397                          "unrolling and peeling ***\n",
 398                          loop->num, loop->header->index);
 399
 400       /* Do not unroll or peel cold areas (loops optimized for size).  */
 401       if (optimize_loop_for_size_p (loop))
 402         {
 403           if (dump_file)
 404             fprintf (dump_file, ";; Not considering loop, cold area\n");
 405           continue;
 406         }
 407
 408       /* Can the loop be manipulated?  */
 409       if (!can_duplicate_loop_p (loop))
 410         {
 411           if (dump_file)
 412             fprintf (dump_file,
 413                      ";; Not considering loop, cannot duplicate\n");
 414           continue;
 415         }
 416
 417       /* Skip non-innermost loops.  */
 418       if (loop->inner)
 419         {
 420           if (dump_file)
 421             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 422           continue;
 423         }
 424       /* Size figures for the decide_* functions below.  */
 425       loop->ninsns = num_loop_insns (loop);
 426       loop->av_ninsns = average_num_loop_insns (loop);
 427
 428       /* Try transformations one by one in decreasing order of
 429          priority.  */
 430
 431       decide_unroll_constant_iterations (loop, flags);
 432       if (loop->lpt_decision.decision == LPT_NONE)
 433         decide_unroll_runtime_iterations (loop, flags);
 434       if (loop->lpt_decision.decision == LPT_NONE)
 435         decide_unroll_stupid (loop, flags);
 436       if (loop->lpt_decision.decision == LPT_NONE)
 437         decide_peel_simple (loop, flags);
 438       /* Report the decision; report_unroll_peel returns at once for LPT_NONE.  */
 439       report_unroll_peel (loop, locus);
 440     }
 441 }
 442
 443 /* Decide whether LOOP is once rolling and suitable for complete peeling;
 444    on success set LOOP->lpt_decision.decision to LPT_PEEL_COMPLETELY.  */
 445 static void
 446 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 447 {
 448   struct niter_desc *desc;
 449
 450   if (dump_file)
 451     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 452
 453   /* Is the loop small enough?  */
 454   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 455     {
 456       if (dump_file)
 457         fprintf (dump_file, ";; Not considering loop, is too big\n");
 458       return;
 459     }
 460
 461   /* Check for simple loops.  */
 462   desc = get_simple_loop_desc (loop);
 463   /* The description must be simple, free of assumptions, finite and of constant count ...  */
 464   /* ... and either NITER or the maximal number of iterations must be zero.  */
 465   if (!desc->simple_p
 466       || desc->assumptions
 467       || desc->infinite
 468       || !desc->const_iter
 469       || (desc->niter != 0
 470           && get_max_loop_iterations_int (loop) != 0))
 471     {
 472       if (dump_file)
 473         fprintf (dump_file,
 474                  ";; Unable to prove that the loop rolls exactly once\n");
 475       return;
 476     }
 477
 478   /* Success.  */
 479   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 480 }
 481
 482 /* Decide whether the LOOP is suitable for complete peeling; on success set its decision to LPT_PEEL_COMPLETELY.  */
 483 static void
 484 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 485 {
 486   unsigned npeel;
 487   struct niter_desc *desc;
 488
 489   if (dump_file)
 490     fprintf (dump_file, "\n;; Considering peeling completely\n");
 491
 492   /* Skip non-innermost loops.  */
 493   if (loop->inner)
 494     {
 495       if (dump_file)
 496         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 497       return;
 498     }
 499
 500   /* Do not peel cold areas.  */
 501   if (optimize_loop_for_size_p (loop))
 502     {
 503       if (dump_file)
 504         fprintf (dump_file, ";; Not considering loop, cold area\n");
 505       return;
 506     }
 507
 508   /* Can the loop be manipulated?  */
 509   if (!can_duplicate_loop_p (loop))
 510     {
 511       if (dump_file)
 512         fprintf (dump_file,
 513                  ";; Not considering loop, cannot duplicate\n");
 514       return;
 515     }
 516
 517   /* npeel = number of iterations to peel, bounded by the insn budget and by the peel-times parameter.  */
 518   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 519   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 520     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 521
 522   /* Is the loop small enough?  */
 523   if (!npeel)
 524     {
 525       if (dump_file)
 526         fprintf (dump_file, ";; Not considering loop, is too big\n");
 527       return;
 528     }
 529
 530   /* Check for simple loops.  */
 531   desc = get_simple_loop_desc (loop);
 532
 533   /* Check number of iterations.  */
 534   if (!desc->simple_p
 535       || desc->assumptions
 536       || !desc->const_iter
 537       || desc->infinite)
 538     {
 539       if (dump_file)
 540         fprintf (dump_file,
 541                  ";; Unable to prove that the loop iterates constant times\n");
 542       return;
 543     }
 544   /* NPEEL is nonzero here, so NPEEL - 1 does not wrap around.  */
 545   if (desc->niter > npeel - 1)
 546     {
 547       if (dump_file)
 548         {
 549           fprintf (dump_file,
 550                    ";; Not peeling loop completely, rolls too much (");
 551           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 552           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 553         }
 554       return;
 555     }
 556
 557   /* Success.  */
 558   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 559 }
 560
 561 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 562    completely.  The transformation done:
 563
 564    for (i = 0; i < 4; i++)
 565      body;
 566
 567    ==>
 568
 569    i = 0;
 570    body; i++;
 571    body; i++;
 572    body; i++;
 573    body; i++;
 574    */
 575 static void
 576 peel_loop_completely (struct loop *loop)
 577 {
 578   sbitmap wont_exit;
 579   unsigned HOST_WIDE_INT npeel;
 580   unsigned i;
 581   edge ein;
 582   struct niter_desc *desc = get_simple_loop_desc (loop);
 583   struct opt_info *opt_info = NULL;
 584
 585   npeel = desc->niter;
 586   /* With zero iterations nothing is duplicated; only the loop is cancelled below.  */
 587   if (npeel)
 588     {
 589       bool ok;
 590       /* NPEEL + 1 bits, all set except bit 0 (and bit 1 if there are noloop assumptions).  */
 591       wont_exit = sbitmap_alloc (npeel + 1);
 592       bitmap_ones (wont_exit);
 593       bitmap_clear_bit (wont_exit, 0);
 594       if (desc->noloop_assumptions)
 595         bitmap_clear_bit (wont_exit, 1);
 596       /* Passed to the duplication below (presumably filled in there); its edges are removed afterwards.  */
 597       auto_vec<edge> remove_edges;
 598       if (flag_split_ivs_in_unroller)
 599         opt_info = analyze_insns_in_loop (loop);
 600
 601       opt_info_start_duplication (opt_info);
 602       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 603                                           npeel,
 604                                           wont_exit, desc->out_edge,
 605                                           &remove_edges,
 606                                           DLTHE_FLAG_UPDATE_FREQ
 607                                           | DLTHE_FLAG_COMPLETTE_PEEL
 608                                           | (opt_info
 609                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 610       gcc_assert (ok);
 611
 612       free (wont_exit);
 613
 614       if (opt_info)
 615         {
 616           apply_opt_in_copies (opt_info, npeel, false, true);
 617           free_opt_info (opt_info);
 618         }
 619
 620       /* Remove the exit edges.  */
 621       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 622         remove_path (ein);
 623     }
 624   /* Fetch the in-edge before the loop description is freed.  */
 625   ein = desc->in_edge;
 626   free_simple_loop_desc (loop);
 627
 628   /* Now remove the unreachable part of the last iteration and cancel
 629      the loop.  */
 630   remove_path (ein);
 631
 632   if (dump_file)
 633     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 634 }
 635
 636 /* Decide whether to unroll LOOP iterating constant number of times and how
 637    much; on success record LPT_UNROLL_CONSTANT and the count in LOOP->lpt_decision.  */
 638
 639 static void
 640 decide_unroll_constant_iterations (struct loop *loop, int flags)
 641 {
 642   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 643   struct niter_desc *desc;
 644   double_int iterations;
 645
 646   if (!(flags & UAP_UNROLL))
 647     {
 648       /* We were not asked to, just return back silently.  */
 649       return;
 650     }
 651
 652   if (dump_file)
 653     fprintf (dump_file,
 654              "\n;; Considering unrolling loop with constant "
 655              "number of iterations\n");
 656
 657   /* nunroll = total number of copies of the original loop body in
 658      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
 659   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 660   nunroll_by_av
 661     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 662   if (nunroll > nunroll_by_av)
 663     nunroll = nunroll_by_av;
 664   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 665     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 666   /* Let the target hook, if there is one, adjust the unroll factor.  */
 667   if (targetm.loop_unroll_adjust)
 668     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 669
 670   /* Skip big loops.  */
 671   if (nunroll <= 1)
 672     {
 673       if (dump_file)
 674         fprintf (dump_file, ";; Not considering loop, is too big\n");
 675       return;
 676     }
 677
 678   /* Check for simple loops.  */
 679   desc = get_simple_loop_desc (loop);
 680
 681   /* Check number of iterations.  */
 682   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 683     {
 684       if (dump_file)
 685         fprintf (dump_file,
 686                  ";; Unable to prove that the loop iterates constant times\n");
 687       return;
 688     }
 689
 690   /* Check whether the loop rolls enough to consider.
 691      Consult also loop bounds and profile; in the case the loop has more
 692      than one exit it may well loop less than determined maximal number
 693      of iterations.  */
 694   if (desc->niter < 2 * nunroll
 695       || ((get_estimated_loop_iterations (loop, &iterations)
 696            || get_max_loop_iterations (loop, &iterations))
 697           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 698     {
 699       if (dump_file)
 700         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 701       return;
 702     }
 703
 704   /* Success; now compute number of iterations to unroll.  We alter
 705      nunroll so that as few as possible copies of loop body are
 706      necessary, while still not decreasing the number of unrollings
 707      too much (at most by 1).  */
 708   best_copies = 2 * nunroll + 10;
 709   /* Start the search at 2 * NUNROLL + 2, or at NITER - 2 when that start would reach NITER.  */
 710   i = 2 * nunroll + 2;
 711   if (i - 1 >= desc->niter)
 712     i = desc->niter - 2;
 713   /* Try each candidate I down to NUNROLL - 1; the iteration count is taken modulo I + 1.  */
 714   for (; i >= nunroll - 1; i--)
 715     {
 716       unsigned exit_mod = desc->niter % (i + 1);
 717       /* N_COPIES is the cost of candidate I; it depends on where the exit test sits.  */
 718       if (!loop_exit_at_end_p (loop))
 719         n_copies = exit_mod + i + 1;
 720       else if (exit_mod != (unsigned) i
 721                || desc->noloop_assumptions != NULL_RTX)
 722         n_copies = exit_mod + i + 2;
 723       else
 724         n_copies = i + 1;
 725       /* Keep the candidate with the fewest copies; on a tie the larger I, seen first, stays.  */
 726       if (n_copies < best_copies)
 727         {
 728           best_copies = n_copies;
 729           best_unroll = i;
 730         }
 731     }
 732
 733   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 734   loop->lpt_decision.times = best_unroll;
 735 }
 736
 737 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 738    The transformation does this:
 739
 740    for (i = 0; i < 102; i++)
 741      body;
 742
 743    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 744
 745    i = 0;
 746    body; i++;
 747    body; i++;
 748    while (i < 102)
 749      {
 750        body; i++;
 751        body; i++;
 752        body; i++;
 753        body; i++;
 754      }
 755   */
 756 static void
 757 unroll_loop_constant_iterations (struct loop *loop)
 758 {
 759   unsigned HOST_WIDE_INT niter;
 760   unsigned exit_mod;
 761   sbitmap wont_exit;
 762   unsigned i;
 763   edge e;
 764   unsigned max_unroll = loop->lpt_decision.times;
 765   struct niter_desc *desc = get_simple_loop_desc (loop);
 766   bool exit_at_end = loop_exit_at_end_p (loop);
 767   struct opt_info *opt_info = NULL;
 768   bool ok;
 769
 770   niter = desc->niter;
 771
 772   /* Should not get here (such loop should be peeled instead).  */
 773   gcc_assert (niter > max_unroll + 1);
 774
 775   exit_mod = niter % (max_unroll + 1);
 776
 777   wont_exit = sbitmap_alloc (max_unroll + 1);
 778   bitmap_ones (wont_exit);
 779
 780   auto_vec<edge> remove_edges;
 781   if (flag_split_ivs_in_unroller
 782       || flag_variable_expansion_in_unroller)
 783     opt_info = analyze_insns_in_loop (loop);
 784
 785   if (!exit_at_end)
 786     {
 787       /* The exit is not at the end of the loop; leave exit test
 788          in the first copy, so that the loops that start with test
 789          of exit condition have continuous body after unrolling.  */
 790
 791       if (dump_file)
 792         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 793
 794       /* Peel exit_mod iterations.  */
 795       bitmap_clear_bit (wont_exit, 0);
 796       if (desc->noloop_assumptions)
 797         bitmap_clear_bit (wont_exit, 1);
 798
 799       if (exit_mod)
 800         {
 801           opt_info_start_duplication (opt_info);
 802           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 803                                               exit_mod,
 804                                               wont_exit, desc->out_edge,
 805                                               &remove_edges,
 806                                               DLTHE_FLAG_UPDATE_FREQ
 807                                               | (opt_info && exit_mod > 1
 808                                                  ? DLTHE_RECORD_COPY_NUMBER
 809                                                    : 0));
 810           gcc_assert (ok);
 811
 812           if (opt_info && exit_mod > 1)
 813             apply_opt_in_copies (opt_info, exit_mod, false, false);
 814
 815           desc->noloop_assumptions = NULL_RTX;
 816           desc->niter -= exit_mod;
 817           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 818           if (loop->any_estimate
 819               && double_int::from_uhwi (exit_mod).ule
 820                    (loop->nb_iterations_estimate))
 821             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 822           else
 823             loop->any_estimate = false;
 824         }
 825
 826       bitmap_set_bit (wont_exit, 1);
 827     }
 828   else
 829     {
 830       /* Leave exit test in last copy, for the same reason as above if
 831          the loop tests the condition at the end of loop body.  */
 832
 833       if (dump_file)
 834         fprintf (dump_file, ";; Condition at end of loop.\n");
 835
 836       /* We know that niter >= max_unroll + 2; so we do not need to care of
 837          case when we would exit before reaching the loop.  So just peel
 838          exit_mod + 1 iterations.  */
 839       if (exit_mod != max_unroll
 840           || desc->noloop_assumptions)
 841         {
 842           bitmap_clear_bit (wont_exit, 0);
 843           if (desc->noloop_assumptions)
 844             bitmap_clear_bit (wont_exit, 1);
 845
 846           opt_info_start_duplication (opt_info);
 847           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 848                                               exit_mod + 1,
 849                                               wont_exit, desc->out_edge,
 850                                               &remove_edges,
 851                                               DLTHE_FLAG_UPDATE_FREQ
 852                                               | (opt_info && exit_mod > 0
 853                                                  ? DLTHE_RECORD_COPY_NUMBER
 854                                                    : 0));
 855           gcc_assert (ok);
 856
 857           if (opt_info && exit_mod > 0)
 858             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 859
 860           desc->niter -= exit_mod + 1;
 861           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 862           if (loop->any_estimate
 863               && double_int::from_uhwi (exit_mod + 1).ule
 864                    (loop->nb_iterations_estimate))
 865             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 866           else
 867             loop->any_estimate = false;
 868           desc->noloop_assumptions = NULL_RTX;
 869
 870           bitmap_set_bit (wont_exit, 0);
 871           bitmap_set_bit (wont_exit, 1);
 872         }
 873
 874       bitmap_clear_bit (wont_exit, max_unroll);
 875     }
 876
 877   /* Now unroll the loop.  */
 878
 879   opt_info_start_duplication (opt_info);
 880   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 881                                       max_unroll,
 882                                       wont_exit, desc->out_edge,
 883                                       &remove_edges,
 884                                       DLTHE_FLAG_UPDATE_FREQ
 885                                       | (opt_info
 886                                          ? DLTHE_RECORD_COPY_NUMBER
 887                                            : 0));
 888   gcc_assert (ok);
 889
 890   if (opt_info)
 891     {
 892       apply_opt_in_copies (opt_info, max_unroll, true, true);
 893       free_opt_info (opt_info);
 894     }
 895
 896   free (wont_exit);
 897
 898   if (exit_at_end)
 899     {
 900       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 901       /* Find a new in and out edge; they are in the last copy we have made.  */
 902
 903       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 904         {
 905           desc->out_edge = EDGE_SUCC (exit_block, 0);
 906           desc->in_edge = EDGE_SUCC (exit_block, 1);
 907         }
 908       else
 909         {
 910           desc->out_edge = EDGE_SUCC (exit_block, 1);
 911           desc->in_edge = EDGE_SUCC (exit_block, 0);
 912         }
 913     }
 914
 915   desc->niter /= max_unroll + 1;
 916   loop->nb_iterations_upper_bound
 917     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 918                                                                    + 1),
 919                                             TRUNC_DIV_EXPR);
 920   if (loop->any_estimate)
 921     loop->nb_iterations_estimate
 922       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 923                                                                   + 1),
 924                                            TRUNC_DIV_EXPR);
 925   desc->niter_expr = GEN_INT (desc->niter);
 926
 927   /* Remove the edges.  */
 928   FOR_EACH_VEC_ELT (remove_edges, i, e)
 929     remove_path (e);
 930
 931   if (dump_file)
 932     fprintf (dump_file,
 933              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 934              max_unroll, num_loop_insns (loop));
 935 }
 936
 937 /* Decide whether to unroll LOOP iterating runtime computable number of times
 938    and how much.  */
 939 static void
 940 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 941 {
 942   unsigned nunroll, nunroll_by_av, i;
 943   struct niter_desc *desc;
 944   double_int iterations;
 945
 946   if (!(flags & UAP_UNROLL))
 947     {
 948       /* We were not asked to, just return back silently.  */
 949       return;
 950     }
 951
 952   if (dump_file)
 953     fprintf (dump_file,
 954              "\n;; Considering unrolling loop with runtime "
 955              "computable number of iterations\n");
 956
 957   /* nunroll = total number of copies of the original loop body in
 958      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 959   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 960   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 961   if (nunroll > nunroll_by_av)
 962     nunroll = nunroll_by_av;
 963   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 964     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 965
 966   if (targetm.loop_unroll_adjust)
 967     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 968
 969   /* Skip big loops.  */
 970   if (nunroll <= 1)
 971     {
 972       if (dump_file)
 973         fprintf (dump_file, ";; Not considering loop, is too big\n");
 974       return;
 975     }
 976
 977   /* Check for simple loops.  */
 978   desc = get_simple_loop_desc (loop);
 979
 980   /* Check simpleness.  */
 981   if (!desc->simple_p || desc->assumptions)
 982     {
 983       if (dump_file)
 984         fprintf (dump_file,
 985                  ";; Unable to prove that the number of iterations "
 986                  "can be counted in runtime\n");
 987       return;
 988     }
 989
 990   if (desc->const_iter)
 991     {
 992       if (dump_file)
 993         fprintf (dump_file, ";; Loop iterates constant times\n");
 994       return;
 995     }
 996
 997   /* Check whether the loop rolls.  */
 998   if ((get_estimated_loop_iterations (loop, &iterations)
 999        || get_max_loop_iterations (loop, &iterations))
1000       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1001     {
1002       if (dump_file)
1003         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1004       return;
1005     }
1006
1007   /* Success; now force nunroll to be power of 2, as we are unable to
1008      cope with overflows in computation of number of iterations.  */
1009   for (i = 1; 2 * i <= nunroll; i *= 2)
1010     continue;
1011
1012   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1013   loop->lpt_decision.times = i - 1;
1014 }
1015
1016 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1017    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1018    and NULL is returned instead.  */
1019
1020 basic_block
1021 split_edge_and_insert (edge e, rtx insns)
1022 {
1023   basic_block bb;
1024
1025   if (!insns)
1026     return NULL;
1027   bb = split_edge (e);
1028   emit_insn_after (insns, BB_END (bb));
1029
1030   /* ??? We used to assume that INSNS can contain control flow insns, and
1031      that we had to try to find sub basic blocks in BB to maintain a valid
1032      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1033      and call break_superblocks when going out of cfglayout mode.  But it
1034      turns out that this never happens; and that if it does ever happen,
1035      the TODO_verify_flow at the end of the RTL loop passes would fail.
1036
1037      There are two reasons why we expected we could have control flow insns
1038      in INSNS.  The first is when a comparison has to be done in parts, and
1039      the second is when the number of iterations is computed for loops with
1040      the number of iterations known at runtime.  In both cases, test cases
1041      to get control flow in INSNS appear to be impossible to construct:
1042
1043       * If do_compare_rtx_and_jump needs several branches to do comparison
1044         in a mode that needs comparison by parts, we cannot analyze the
1045         number of iterations of the loop, and we never get to unrolling it.
1046
1047       * The code in expand_divmod that was suspected to cause creation of
1048         branching code seems to be only accessed for signed division.  The
1049         divisions used by # of iterations analysis are always unsigned.
1050         Problems might arise on architectures that emits branching code
1051         for some operations that may appear in the unroller (especially
1052         for division), but we have no such architectures.
1053
1054      Considering all this, it was decided that we should for now assume
1055      that INSNS can in theory contain control flow insns, but in practice
1056      it never does.  So we don't handle the theoretical case, and should
1057      a real failure ever show up, we have a pretty good clue for how to
1058      fix it.  */
1059
1060   return bb;
1061 }
1062
1063 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
1064    true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
1065    in order to create a jump.  */
1066
1067 static rtx
1068 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
1069                       rtx cinsn)
1070 {
1071   rtx seq, jump, cond;
1072   enum machine_mode mode;
1073
1074   mode = GET_MODE (op0);
1075   if (mode == VOIDmode)
1076     mode = GET_MODE (op1);
1077
1078   start_sequence ();
1079   if (GET_MODE_CLASS (mode) == MODE_CC)
1080     {
1081       /* A hack -- there seems to be no easy generic way how to make a
1082          conditional jump from a ccmode comparison.  */
1083       gcc_assert (cinsn);
1084       cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
1085       gcc_assert (GET_CODE (cond) == comp);
1086       gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
1087       gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
1088       emit_jump_insn (copy_insn (PATTERN (cinsn)));
1089       jump = get_last_insn ();
1090       gcc_assert (JUMP_P (jump));
1091       JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
1092       LABEL_NUSES (JUMP_LABEL (jump))++;
1093       redirect_jump (jump, label, 0);
1094     }
1095   else
1096     {
1097       gcc_assert (!cinsn);
1098
1099       op0 = force_operand (op0, NULL_RTX);
1100       op1 = force_operand (op1, NULL_RTX);
1101       do_compare_rtx_and_jump (op0, op1, comp, 0,
1102                                mode, NULL_RTX, NULL_RTX, label, -1);
1103       jump = get_last_insn ();
1104       gcc_assert (JUMP_P (jump));
1105       JUMP_LABEL (jump) = label;
1106       LABEL_NUSES (label)++;
1107     }
1108   add_int_reg_note (jump, REG_BR_PROB, prob);
1109
1110   seq = get_insns ();
1111   end_sequence ();
1112
1113   return seq;
1114 }
1115
1116 /* Unroll LOOP for which we are able to count number of iterations in runtime
1117    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1118    extra care for case n < 0):
1119
1120    for (i = 0; i < n; i++)
1121      body;
1122
1123    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1124
1125    i = 0;
1126    mod = n % 4;
1127
1128    switch (mod)
1129      {
1130        case 3:
1131          body; i++;
1132        case 2:
1133          body; i++;
1134        case 1:
1135          body; i++;
1136        case 0: ;
1137      }
1138
1139    while (i < n)
1140      {
1141        body; i++;
1142        body; i++;
1143        body; i++;
1144        body; i++;
1145      }
1146    */
1147 static void
1148 unroll_loop_runtime_iterations (struct loop *loop)
1149 {
1150   rtx old_niter, niter, init_code, branch_code, tmp;
1151   unsigned i, j, p;
1152   basic_block preheader, *body, swtch, ezc_swtch;
1153   sbitmap wont_exit;
1154   int may_exit_copy;
1155   unsigned n_peel;
1156   edge e;
1157   bool extra_zero_check, last_may_exit;
1158   unsigned max_unroll = loop->lpt_decision.times;
1159   struct niter_desc *desc = get_simple_loop_desc (loop);
1160   bool exit_at_end = loop_exit_at_end_p (loop);
1161   struct opt_info *opt_info = NULL;
1162   bool ok;
1163
1164   if (flag_split_ivs_in_unroller
1165       || flag_variable_expansion_in_unroller)
1166     opt_info = analyze_insns_in_loop (loop);
1167
1168   /* Remember blocks whose dominators will have to be updated.  */
1169   auto_vec<basic_block> dom_bbs;
1170
1171   body = get_loop_body (loop);
1172   for (i = 0; i < loop->num_nodes; i++)
1173     {
1174       vec<basic_block> ldom;
1175       basic_block bb;
1176
1177       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1178       FOR_EACH_VEC_ELT (ldom, j, bb)
1179         if (!flow_bb_inside_loop_p (loop, bb))
1180           dom_bbs.safe_push (bb);
1181
1182       ldom.release ();
1183     }
1184   free (body);
1185
1186   if (!exit_at_end)
1187     {
1188       /* Leave exit in first copy (for explanation why see comment in
1189          unroll_loop_constant_iterations).  */
1190       may_exit_copy = 0;
1191       n_peel = max_unroll - 1;
1192       extra_zero_check = true;
1193       last_may_exit = false;
1194     }
1195   else
1196     {
1197       /* Leave exit in last copy (for explanation why see comment in
1198          unroll_loop_constant_iterations).  */
1199       may_exit_copy = max_unroll;
1200       n_peel = max_unroll;
1201       extra_zero_check = false;
1202       last_may_exit = true;
1203     }
1204
1205   /* Get expression for number of iterations.  */
1206   start_sequence ();
1207   old_niter = niter = gen_reg_rtx (desc->mode);
1208   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1209   if (tmp != niter)
1210     emit_move_insn (niter, tmp);
1211
1212   /* Count modulo by ANDing it with max_unroll; we use the fact that
1213      the number of unrollings is a power of two, and thus this is correct
1214      even if there is overflow in the computation.  */
1215   niter = expand_simple_binop (desc->mode, AND,
1216                                niter, gen_int_mode (max_unroll, desc->mode),
1217                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1218
1219   init_code = get_insns ();
1220   end_sequence ();
1221   unshare_all_rtl_in_chain (init_code);
1222
1223   /* Precondition the loop.  */
1224   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1225
1226   auto_vec<edge> remove_edges;
1227
1228   wont_exit = sbitmap_alloc (max_unroll + 2);
1229
1230   /* Peel the first copy of loop body (almost always we must leave exit test
1231      here; the only exception is when we have extra zero check and the number
1232      of iterations is reliable.  Also record the place of (possible) extra
1233      zero check.  */
1234   bitmap_clear (wont_exit);
1235   if (extra_zero_check
1236       && !desc->noloop_assumptions)
1237     bitmap_set_bit (wont_exit, 1);
1238   ezc_swtch = loop_preheader_edge (loop)->src;
1239   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1240                                       1, wont_exit, desc->out_edge,
1241                                       &remove_edges,
1242                                       DLTHE_FLAG_UPDATE_FREQ);
1243   gcc_assert (ok);
1244
1245   /* Record the place where switch will be built for preconditioning.  */
1246   swtch = split_edge (loop_preheader_edge (loop));
1247
1248   for (i = 0; i < n_peel; i++)
1249     {
1250       /* Peel the copy.  */
1251       bitmap_clear (wont_exit);
1252       if (i != n_peel - 1 || !last_may_exit)
1253         bitmap_set_bit (wont_exit, 1);
1254       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1255                                           1, wont_exit, desc->out_edge,
1256                                           &remove_edges,
1257                                           DLTHE_FLAG_UPDATE_FREQ);
1258       gcc_assert (ok);
1259
1260       /* Create item for switch.  */
1261       j = n_peel - i - (extra_zero_check ? 0 : 1);
1262       p = REG_BR_PROB_BASE / (i + 2);
1263
1264       preheader = split_edge (loop_preheader_edge (loop));
1265       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1266                                           block_label (preheader), p,
1267                                           NULL_RTX);
1268
1269       /* We rely on the fact that the compare and jump cannot be optimized out,
1270          and hence the cfg we create is correct.  */
1271       gcc_assert (branch_code != NULL_RTX);
1272
1273       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1274       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1275       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1276       e = make_edge (swtch, preheader,
1277                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1278       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1279       e->probability = p;
1280     }
1281
1282   if (extra_zero_check)
1283     {
1284       /* Add branch for zero iterations.  */
1285       p = REG_BR_PROB_BASE / (max_unroll + 1);
1286       swtch = ezc_swtch;
1287       preheader = split_edge (loop_preheader_edge (loop));
1288       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1289                                           block_label (preheader), p,
1290                                           NULL_RTX);
1291       gcc_assert (branch_code != NULL_RTX);
1292
1293       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1294       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1295       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1296       e = make_edge (swtch, preheader,
1297                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1298       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1299       e->probability = p;
1300     }
1301
1302   /* Recount dominators for outer blocks.  */
1303   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1304
1305   /* And unroll loop.  */
1306
1307   bitmap_ones (wont_exit);
1308   bitmap_clear_bit (wont_exit, may_exit_copy);
1309   opt_info_start_duplication (opt_info);
1310
1311   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1312                                       max_unroll,
1313                                       wont_exit, desc->out_edge,
1314                                       &remove_edges,
1315                                       DLTHE_FLAG_UPDATE_FREQ
1316                                       | (opt_info
1317                                          ? DLTHE_RECORD_COPY_NUMBER
1318                                            : 0));
1319   gcc_assert (ok);
1320
1321   if (opt_info)
1322     {
1323       apply_opt_in_copies (opt_info, max_unroll, true, true);
1324       free_opt_info (opt_info);
1325     }
1326
1327   free (wont_exit);
1328
1329   if (exit_at_end)
1330     {
1331       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1332       /* Find a new in and out edge; they are in the last copy we have
1333          made.  */
1334
1335       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1336         {
1337           desc->out_edge = EDGE_SUCC (exit_block, 0);
1338           desc->in_edge = EDGE_SUCC (exit_block, 1);
1339         }
1340       else
1341         {
1342           desc->out_edge = EDGE_SUCC (exit_block, 1);
1343           desc->in_edge = EDGE_SUCC (exit_block, 0);
1344         }
1345     }
1346
1347   /* Remove the edges.  */
1348   FOR_EACH_VEC_ELT (remove_edges, i, e)
1349     remove_path (e);
1350
1351   /* We must be careful when updating the number of iterations due to
1352      preconditioning and the fact that the value must be valid at entry
1353      of the loop.  After passing through the above code, we see that
1354      the correct new number of iterations is this:  */
1355   gcc_assert (!desc->const_iter);
1356   desc->niter_expr =
1357     simplify_gen_binary (UDIV, desc->mode, old_niter,
1358                          gen_int_mode (max_unroll + 1, desc->mode));
1359   loop->nb_iterations_upper_bound
1360     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1361                                                                    + 1),
1362                                             TRUNC_DIV_EXPR);
1363   if (loop->any_estimate)
1364     loop->nb_iterations_estimate
1365       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1366                                                                   + 1),
1367                                            TRUNC_DIV_EXPR);
1368   if (exit_at_end)
1369     {
1370       desc->niter_expr =
1371         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1372       desc->noloop_assumptions = NULL_RTX;
1373       --loop->nb_iterations_upper_bound;
1374       if (loop->any_estimate
1375           && loop->nb_iterations_estimate != double_int_zero)
1376         --loop->nb_iterations_estimate;
1377       else
1378         loop->any_estimate = false;
1379     }
1380
1381   if (dump_file)
1382     fprintf (dump_file,
1383              ";; Unrolled loop %d times, counting # of iterations "
1384              "in runtime, %i insns\n",
1385              max_unroll, num_loop_insns (loop));
1386 }
1387
1388 /* Decide whether to simply peel LOOP and how much.  */
1389 static void
1390 decide_peel_simple (struct loop *loop, int flags)
1391 {
1392   unsigned npeel;
1393   double_int iterations;
1394
1395   if (!(flags & UAP_PEEL))
1396     {
1397       /* We were not asked to, just return back silently.  */
1398       return;
1399     }
1400
1401   if (dump_file)
1402     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1403
1404   /* npeel = number of iterations to peel.  */
1405   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1406   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1407     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1408
1409   /* Skip big loops.  */
1410   if (!npeel)
1411     {
1412       if (dump_file)
1413         fprintf (dump_file, ";; Not considering loop, is too big\n");
1414       return;
1415     }
1416
1417   /* Do not simply peel loops with branches inside -- it increases number
1418      of mispredicts.
1419      Exception is when we do have profile and we however have good chance
1420      to peel proper number of iterations loop will iterate in practice.
1421      TODO: this heuristic needs tunning; while for complette unrolling
1422      the branch inside loop mostly eliminates any improvements, for
1423      peeling it is not the case.  Also a function call inside loop is
1424      also branch from branch prediction POV (and probably better reason
1425      to not unroll/peel).  */
1426   if (num_loop_branches (loop) > 1
1427       && profile_status_for_fn (cfun) != PROFILE_READ)
1428     {
1429       if (dump_file)
1430         fprintf (dump_file, ";; Not peeling, contains branches\n");
1431       return;
1432     }
1433
1434   /* If we have realistic estimate on number of iterations, use it.  */
1435   if (get_estimated_loop_iterations (loop, &iterations))
1436     {
1437       if (double_int::from_shwi (npeel).ule (iterations))
1438         {
1439           if (dump_file)
1440             {
1441               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1442               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1443                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1444               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1445                        npeel);
1446             }
1447           return;
1448         }
1449       npeel = iterations.to_shwi () + 1;
1450     }
1451   /* If we have small enough bound on iterations, we can still peel (completely
1452      unroll).  */
1453   else if (get_max_loop_iterations (loop, &iterations)
1454            && iterations.ult (double_int::from_shwi (npeel)))
1455     npeel = iterations.to_shwi () + 1;
1456   else
1457     {
1458       /* For now we have no good heuristics to decide whether loop peeling
1459          will be effective, so disable it.  */
1460       if (dump_file)
1461         fprintf (dump_file,
1462                  ";; Not peeling loop, no evidence it will be profitable\n");
1463       return;
1464     }
1465
1466   /* Success.  */
1467   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1468   loop->lpt_decision.times = npeel;
1469 }
1470
1471 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1472
1473    while (cond)
1474      body;
1475
1476    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1477
1478    if (!cond) goto end;
1479    body;
1480    if (!cond) goto end;
1481    body;
1482    if (!cond) goto end;
1483    body;
1484    while (cond)
1485      body;
1486    end: ;
1487    */
1488 static void
1489 peel_loop_simple (struct loop *loop)
1490 {
1491   sbitmap wont_exit;
1492   unsigned npeel = loop->lpt_decision.times;
1493   struct niter_desc *desc = get_simple_loop_desc (loop);
1494   struct opt_info *opt_info = NULL;
1495   bool ok;
1496
1497   if (flag_split_ivs_in_unroller && npeel > 1)
1498     opt_info = analyze_insns_in_loop (loop);
1499
1500   wont_exit = sbitmap_alloc (npeel + 1);
1501   bitmap_clear (wont_exit);
1502
1503   opt_info_start_duplication (opt_info);
1504
1505   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1506                                       npeel, wont_exit, NULL,
1507                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1508                                       | (opt_info
1509                                          ? DLTHE_RECORD_COPY_NUMBER
1510                                            : 0));
1511   gcc_assert (ok);
1512
1513   free (wont_exit);
1514
1515   if (opt_info)
1516     {
1517       apply_opt_in_copies (opt_info, npeel, false, false);
1518       free_opt_info (opt_info);
1519     }
1520
1521   if (desc->simple_p)
1522     {
1523       if (desc->const_iter)
1524         {
1525           desc->niter -= npeel;
1526           desc->niter_expr = GEN_INT (desc->niter);
1527           desc->noloop_assumptions = NULL_RTX;
1528         }
1529       else
1530         {
1531           /* We cannot just update niter_expr, as its value might be clobbered
1532              inside loop.  We could handle this by counting the number into
1533              temporary just like we do in runtime unrolling, but it does not
1534              seem worthwhile.  */
1535           free_simple_loop_desc (loop);
1536         }
1537     }
1538   if (dump_file)
1539     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1540 }
1541
1542 /* Decide whether to unroll LOOP stupidly and how much.  */
1543 static void
1544 decide_unroll_stupid (struct loop *loop, int flags)
1545 {
1546   unsigned nunroll, nunroll_by_av, i;
1547   struct niter_desc *desc;
1548   double_int iterations;
1549
1550   if (!(flags & UAP_UNROLL_ALL))
1551     {
1552       /* We were not asked to, just return back silently.  */
1553       return;
1554     }
1555
1556   if (dump_file)
1557     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1558
1559   /* nunroll = total number of copies of the original loop body in
1560      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1561   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1562   nunroll_by_av
1563     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1564   if (nunroll > nunroll_by_av)
1565     nunroll = nunroll_by_av;
1566   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1567     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1568
1569   if (targetm.loop_unroll_adjust)
1570     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1571
1572   /* Skip big loops.  */
1573   if (nunroll <= 1)
1574     {
1575       if (dump_file)
1576         fprintf (dump_file, ";; Not considering loop, is too big\n");
1577       return;
1578     }
1579
1580   /* Check for simple loops.  */
1581   desc = get_simple_loop_desc (loop);
1582
1583   /* Check simpleness.  */
1584   if (desc->simple_p && !desc->assumptions)
1585     {
1586       if (dump_file)
1587         fprintf (dump_file, ";; The loop is simple\n");
1588       return;
1589     }
1590
1591   /* Do not unroll loops with branches inside -- it increases number
1592      of mispredicts.
1593      TODO: this heuristic needs tunning; call inside the loop body
1594      is also relatively good reason to not unroll.  */
1595   if (num_loop_branches (loop) > 1)
1596     {
1597       if (dump_file)
1598         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1599       return;
1600     }
1601
1602   /* Check whether the loop rolls.  */
1603   if ((get_estimated_loop_iterations (loop, &iterations)
1604        || get_max_loop_iterations (loop, &iterations))
1605       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1606     {
1607       if (dump_file)
1608         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1609       return;
1610     }
1611
1612   /* Success.  Now force nunroll to be power of 2, as it seems that this
1613      improves results (partially because of better alignments, partially
1614      because of some dark magic).  */
1615   for (i = 1; 2 * i <= nunroll; i *= 2)
1616     continue;
1617
1618   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1619   loop->lpt_decision.times = i - 1;
1620 }
1621
1622 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1623
1624    while (cond)
1625      body;
1626
1627    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1628
1629    while (cond)
1630      {
1631        body;
1632        if (!cond) break;
1633        body;
1634        if (!cond) break;
1635        body;
1636        if (!cond) break;
1637        body;
1638      }
1639    */
1640 static void
1641 unroll_loop_stupid (struct loop *loop)
1642 {
1643   sbitmap wont_exit;
1644   unsigned nunroll = loop->lpt_decision.times;
1645   struct niter_desc *desc = get_simple_loop_desc (loop);
1646   struct opt_info *opt_info = NULL;
1647   bool ok;
1648
1649   if (flag_split_ivs_in_unroller
1650       || flag_variable_expansion_in_unroller)
1651     opt_info = analyze_insns_in_loop (loop);
1652
1653
1654   wont_exit = sbitmap_alloc (nunroll + 1);
1655   bitmap_clear (wont_exit);
1656   opt_info_start_duplication (opt_info);
1657
1658   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1659                                       nunroll, wont_exit,
1660                                       NULL, NULL,
1661                                       DLTHE_FLAG_UPDATE_FREQ
1662                                       | (opt_info
1663                                          ? DLTHE_RECORD_COPY_NUMBER
1664                                            : 0));
1665   gcc_assert (ok);
1666
1667   if (opt_info)
1668     {
1669       apply_opt_in_copies (opt_info, nunroll, true, true);
1670       free_opt_info (opt_info);
1671     }
1672
1673   free (wont_exit);
1674
1675   if (desc->simple_p)
1676     {
1677       /* We indeed may get here provided that there are nontrivial assumptions
1678          for a loop to be really simple.  We could update the counts, but the
1679          problem is that we are unable to decide which exit will be taken
1680          (not really true in case the number of iterations is constant,
1681          but no one will do anything with this information, so we do not
1682          worry about it).  */
1683       desc->simple_p = false;
1684     }
1685
1686   if (dump_file)
1687     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1688              nunroll, num_loop_insns (loop));
1689 }
1690
1691 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1692    Set *DEBUG_USES to the number of debug insns that reference the
1693    variable.  */
1694
1695 bool
1696 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1697                                   int *debug_uses)
1698 {
1699   basic_block *body, bb;
1700   unsigned i;
1701   int count_ref = 0;
1702   rtx insn;
1703
1704   body = get_loop_body (loop);
1705   for (i = 0; i < loop->num_nodes; i++)
1706     {
1707       bb = body[i];
1708
1709       FOR_BB_INSNS (bb, insn)
1710         if (!rtx_referenced_p (reg, insn))
1711           continue;
1712         else if (DEBUG_INSN_P (insn))
1713           ++*debug_uses;
1714         else if (++count_ref > 1)
1715           break;
1716     }
1717   free (body);
1718   return (count_ref  == 1);
1719 }
1720
1721 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1722
1723 static void
1724 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1725 {
1726   basic_block *body, bb;
1727   unsigned i;
1728   rtx insn;
1729
1730   body = get_loop_body (loop);
1731   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1732     {
1733       bb = body[i];
1734
1735       FOR_BB_INSNS (bb, insn)
1736         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1737           continue;
1738         else
1739           {
1740             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1741                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1742             if (!--debug_uses)
1743               break;
1744           }
1745     }
1746   free (body);
1747 }
1748
1749 /* Determine whether INSN contains an accumulator
1750    which can be expanded into separate copies,
1751    one for each copy of the LOOP body.
1752
1753    for (i = 0 ; i < n; i++)
1754      sum += a[i];
1755
1756    ==>
1757
1758    sum += a[i]
1759    ....
1760    i = i+1;
1761    sum1 += a[i]
1762    ....
1763    i = i+1
1764    sum2 += a[i];
1765    ....
1766
1767    Return NULL if INSN contains no opportunity for expansion of accumulator.
1768    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1769    information and return a pointer to it.
1770 */
1771
1772 static struct var_to_expand *
1773 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1774 {
1775   rtx set, dest, src;
1776   struct var_to_expand *ves;
1777   unsigned accum_pos;
1778   enum rtx_code code;
1779   int debug_uses = 0;
1780
1781   set = single_set (insn);
1782   if (!set)
1783     return NULL;
1784
1785   dest = SET_DEST (set);
1786   src = SET_SRC (set);
1787   code = GET_CODE (src);
1788
1789   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1790     return NULL;
1791
1792   if (FLOAT_MODE_P (GET_MODE (dest)))
1793     {
1794       if (!flag_associative_math)
1795         return NULL;
1796       /* In the case of FMA, we're also changing the rounding.  */
1797       if (code == FMA && !flag_unsafe_math_optimizations)
1798         return NULL;
1799     }
1800
1801   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1802      in MD.  But if there is no optab to generate the insn, we can not
1803      perform the variable expansion.  This can happen if an MD provides
1804      an insn but not a named pattern to generate it, for example to avoid
1805      producing code that needs additional mode switches like for x87/mmx.
1806
1807      So we check have_insn_for which looks for an optab for the operation
1808      in SRC.  If it doesn't exist, we can't perform the expansion even
1809      though INSN is valid.  */
1810   if (!have_insn_for (code, GET_MODE (src)))
1811     return NULL;
1812
1813   if (!REG_P (dest)
1814       && !(GET_CODE (dest) == SUBREG
1815            && REG_P (SUBREG_REG (dest))))
1816     return NULL;
1817
1818   /* Find the accumulator use within the operation.  */
1819   if (code == FMA)
1820     {
1821       /* We only support accumulation via FMA in the ADD position.  */
1822       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1823         return NULL;
1824       accum_pos = 2;
1825     }
1826   else if (rtx_equal_p (dest, XEXP (src, 0)))
1827     accum_pos = 0;
1828   else if (rtx_equal_p (dest, XEXP (src, 1)))
1829     {
1830       /* The method of expansion that we are using; which includes the
1831          initialization of the expansions with zero and the summation of
1832          the expansions at the end of the computation will yield wrong
1833          results for (x = something - x) thus avoid using it in that case.  */
1834       if (code == MINUS)
1835         return NULL;
1836       accum_pos = 1;
1837     }
1838   else
1839     return NULL;
1840
1841   /* It must not otherwise be used.  */
1842   if (code == FMA)
1843     {
1844       if (rtx_referenced_p (dest, XEXP (src, 0))
1845           || rtx_referenced_p (dest, XEXP (src, 1)))
1846         return NULL;
1847     }
1848   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1849     return NULL;
1850
1851   /* It must be used in exactly one insn.  */
1852   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1853     return NULL;
1854
1855   if (dump_file)
1856     {
1857       fprintf (dump_file, "\n;; Expanding Accumulator ");
1858       print_rtl (dump_file, dest);
1859       fprintf (dump_file, "\n");
1860     }
1861
1862   if (debug_uses)
1863     /* Instead of resetting the debug insns, we could replace each
1864        debug use in the loop with the sum or product of all expanded
1865        accummulators.  Since we'll only know of all expansions at the
1866        end, we'd have to keep track of which vars_to_expand a debug
1867        insn in the loop references, take note of each copy of the
1868        debug insn during unrolling, and when it's all done, compute
1869        the sum or product of each variable and adjust the original
1870        debug insn and each copy thereof.  What a pain!  */
1871     reset_debug_uses_in_loop (loop, dest, debug_uses);
1872
1873   /* Record the accumulator to expand.  */
1874   ves = XNEW (struct var_to_expand);
1875   ves->insn = insn;
1876   ves->reg = copy_rtx (dest);
1877   ves->var_expansions.create (1);
1878   ves->next = NULL;
1879   ves->op = GET_CODE (src);
1880   ves->expansion_count = 0;
1881   ves->reuse_expansion = 0;
1882   return ves;
1883 }
1884
1885 /* Determine whether there is an induction variable in INSN that
1886    we would like to split during unrolling.
1887
1888    I.e. replace
1889
1890    i = i + 1;
1891    ...
1892    i = i + 1;
1893    ...
1894    i = i + 1;
1895    ...
1896
1897    type chains by
1898
1899    i0 = i + 1
1900    ...
1901    i = i0 + 1
1902    ...
1903    i = i0 + 2
1904    ...
1905
1906    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1907    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1908    pointer to it.  */
1909
1910 static struct iv_to_split *
1911 analyze_iv_to_split_insn (rtx insn)
1912 {
1913   rtx set, dest;
1914   struct rtx_iv iv;
1915   struct iv_to_split *ivts;
1916   bool ok;
1917
1918   /* For now we just split the basic induction variables.  Later this may be
1919      extended for example by selecting also addresses of memory references.  */
1920   set = single_set (insn);
1921   if (!set)
1922     return NULL;
1923
1924   dest = SET_DEST (set);
1925   if (!REG_P (dest))
1926     return NULL;
1927
1928   if (!biv_p (insn, dest))
1929     return NULL;
1930
1931   ok = iv_analyze_result (insn, dest, &iv);
1932
1933   /* This used to be an assert under the assumption that if biv_p returns
1934      true that iv_analyze_result must also return true.  However, that
1935      assumption is not strictly correct as evidenced by pr25569.
1936
1937      Returning NULL when iv_analyze_result returns false is safe and
1938      avoids the problems in pr25569 until the iv_analyze_* routines
1939      can be fixed, which is apparently hard and time consuming
1940      according to their author.  */
1941   if (! ok)
1942     return NULL;
1943
1944   if (iv.step == const0_rtx
1945       || iv.mode != iv.extend_mode)
1946     return NULL;
1947
1948   /* Record the insn to split.  */
1949   ivts = XNEW (struct iv_to_split);
1950   ivts->insn = insn;
1951   ivts->orig_var = dest;
1952   ivts->base_var = NULL_RTX;
1953   ivts->step = iv.step;
1954   ivts->next = NULL;
1955   ivts->n_loc = 1;
1956   ivts->loc[0] = 1;
1957
1958   return ivts;
1959 }
1960
1961 /* Determines which of insns in LOOP can be optimized.
1962    Return a OPT_INFO struct with the relevant hash tables filled
1963    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1964    is undefined for the return value.  */
1965
1966 static struct opt_info *
1967 analyze_insns_in_loop (struct loop *loop)
1968 {
1969   basic_block *body, bb;
1970   unsigned i;
1971   struct opt_info *opt_info = XCNEW (struct opt_info);
1972   rtx insn;
1973   struct iv_to_split *ivts = NULL;
1974   struct var_to_expand *ves = NULL;
1975   iv_to_split **slot1;
1976   var_to_expand **slot2;
1977   vec<edge> edges = get_loop_exit_edges (loop);
1978   edge exit;
1979   bool can_apply = false;
1980
1981   iv_analysis_loop_init (loop);
1982
1983   body = get_loop_body (loop);
1984
1985   if (flag_split_ivs_in_unroller)
1986     {
1987       opt_info->insns_to_split.create (5 * loop->num_nodes);
1988       opt_info->iv_to_split_head = NULL;
1989       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1990     }
1991
1992   /* Record the loop exit bb and loop preheader before the unrolling.  */
1993   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1994
1995   if (edges.length () == 1)
1996     {
1997       exit = edges[0];
1998       if (!(exit->flags & EDGE_COMPLEX))
1999         {
2000           opt_info->loop_exit = split_edge (exit);
2001           can_apply = true;
2002         }
2003     }
2004
2005   if (flag_variable_expansion_in_unroller
2006       && can_apply)
2007     {
2008       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
2009       opt_info->var_to_expand_head = NULL;
2010       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
2011     }
2012
2013   for (i = 0; i < loop->num_nodes; i++)
2014     {
2015       bb = body[i];
2016       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
2017         continue;
2018
2019       FOR_BB_INSNS (bb, insn)
2020       {
2021         if (!INSN_P (insn))
2022           continue;
2023
2024         if (opt_info->insns_to_split.is_created ())
2025           ivts = analyze_iv_to_split_insn (insn);
2026
2027         if (ivts)
2028           {
2029             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
2030             gcc_assert (*slot1 == NULL);
2031             *slot1 = ivts;
2032             *opt_info->iv_to_split_tail = ivts;
2033             opt_info->iv_to_split_tail = &ivts->next;
2034             continue;
2035           }
2036
2037         if (opt_info->insns_with_var_to_expand.is_created ())
2038           ves = analyze_insn_to_expand_var (loop, insn);
2039
2040         if (ves)
2041           {
2042             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
2043             gcc_assert (*slot2 == NULL);
2044             *slot2 = ves;
2045             *opt_info->var_to_expand_tail = ves;
2046             opt_info->var_to_expand_tail = &ves->next;
2047           }
2048       }
2049     }
2050
2051   edges.release ();
2052   free (body);
2053   return opt_info;
2054 }
2055
2056 /* Called just before loop duplication.  Records start of duplicated area
2057    to OPT_INFO.  */
2058
2059 static void
2060 opt_info_start_duplication (struct opt_info *opt_info)
2061 {
2062   if (opt_info)
2063     opt_info->first_new_block = last_basic_block_for_fn (cfun);
2064 }
2065
/* Return the number of iterations between the initialization of the
   base variable and the copy numbered N_COPY.  N_COPIES is the total
   number of newly created copies.  UNROLLING is true if the loop is
   being unrolled (not peeled).  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization is done in the original loop
     body (number 0), so the copy number is the distance.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization occurs in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
2090
2091 /* Locate in EXPR the expression corresponding to the location recorded
2092    in IVTS, and return a pointer to the RTX for this location.  */
2093
2094 static rtx *
2095 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2096 {
2097   unsigned i;
2098   rtx *ret = &expr;
2099
2100   for (i = 0; i < ivts->n_loc; i++)
2101     ret = &XEXP (*ret, ivts->loc[i]);
2102
2103   return ret;
2104 }
2105
2106 /* Allocate basic variable for the induction variable chain.  */
2107
2108 static void
2109 allocate_basic_variable (struct iv_to_split *ivts)
2110 {
2111   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2112
2113   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2114 }
2115
2116 /* Insert initialization of basic variable of IVTS before INSN, taking
2117    the initial value from INSN.  */
2118
2119 static void
2120 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2121 {
2122   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2123   rtx seq;
2124
2125   start_sequence ();
2126   expr = force_operand (expr, ivts->base_var);
2127   if (expr != ivts->base_var)
2128     emit_move_insn (ivts->base_var, expr);
2129   seq = get_insns ();
2130   end_sequence ();
2131
2132   emit_insn_before (seq, insn);
2133 }
2134
2135 /* Replace the use of induction variable described in IVTS in INSN
2136    by base variable + DELTA * step.  */
2137
2138 static void
2139 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2140 {
2141   rtx expr, *loc, seq, incr, var;
2142   enum machine_mode mode = GET_MODE (ivts->base_var);
2143   rtx src, dest, set;
2144
2145   /* Construct base + DELTA * step.  */
2146   if (!delta)
2147     expr = ivts->base_var;
2148   else
2149     {
2150       incr = simplify_gen_binary (MULT, mode,
2151                                   ivts->step, gen_int_mode (delta, mode));
2152       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2153                                   ivts->base_var, incr);
2154     }
2155
2156   /* Figure out where to do the replacement.  */
2157   loc = get_ivts_expr (single_set (insn), ivts);
2158
2159   /* If we can make the replacement right away, we're done.  */
2160   if (validate_change (insn, loc, expr, 0))
2161     return;
2162
2163   /* Otherwise, force EXPR into a register and try again.  */
2164   start_sequence ();
2165   var = gen_reg_rtx (mode);
2166   expr = force_operand (expr, var);
2167   if (expr != var)
2168     emit_move_insn (var, expr);
2169   seq = get_insns ();
2170   end_sequence ();
2171   emit_insn_before (seq, insn);
2172
2173   if (validate_change (insn, loc, var, 0))
2174     return;
2175
2176   /* The last chance.  Try recreating the assignment in insn
2177      completely from scratch.  */
2178   set = single_set (insn);
2179   gcc_assert (set);
2180
2181   start_sequence ();
2182   *loc = var;
2183   src = copy_rtx (SET_SRC (set));
2184   dest = copy_rtx (SET_DEST (set));
2185   src = force_operand (src, dest);
2186   if (src != dest)
2187     emit_move_insn (dest, src);
2188   seq = get_insns ();
2189   end_sequence ();
2190
2191   emit_insn_before (seq, insn);
2192   delete_insn (insn);
2193 }
2194
2195
2196 /* Return one expansion of the accumulator recorded in struct VE.  */
2197
2198 static rtx
2199 get_expansion (struct var_to_expand *ve)
2200 {
2201   rtx reg;
2202
2203   if (ve->reuse_expansion == 0)
2204     reg = ve->reg;
2205   else
2206     reg = ve->var_expansions[ve->reuse_expansion - 1];
2207
2208   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2209     ve->reuse_expansion = 0;
2210   else
2211     ve->reuse_expansion++;
2212
2213   return reg;
2214 }
2215
2216
2217 /* Given INSN replace the uses of the accumulator recorded in VE
2218    with a new register.  */
2219
2220 static void
2221 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2222 {
2223   rtx new_reg, set;
2224   bool really_new_expansion = false;
2225
2226   set = single_set (insn);
2227   gcc_assert (set);
2228
2229   /* Generate a new register only if the expansion limit has not been
2230      reached.  Else reuse an already existing expansion.  */
2231   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2232     {
2233       really_new_expansion = true;
2234       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2235     }
2236   else
2237     new_reg = get_expansion (ve);
2238
2239   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2240   if (apply_change_group ())
2241     if (really_new_expansion)
2242       {
2243         ve->var_expansions.safe_push (new_reg);
2244         ve->expansion_count++;
2245       }
2246 }
2247
2248 /* Initialize the variable expansions in loop preheader.  PLACE is the
2249    loop-preheader basic block where the initialization of the
2250    expansions should take place.  The expansions are initialized with
2251    (-0) when the operation is plus or minus to honor sign zero.  This
2252    way we can prevent cases where the sign of the final result is
2253    effected by the sign of the expansion.  Here is an example to
2254    demonstrate this:
2255
2256    for (i = 0 ; i < n; i++)
2257      sum += something;
2258
2259    ==>
2260
2261    sum += something
2262    ....
2263    i = i+1;
2264    sum1 += something
2265    ....
2266    i = i+1
2267    sum2 += something;
2268    ....
2269
2270    When SUM is initialized with -zero and SOMETHING is also -zero; the
2271    final result of sum should be -zero thus the expansions sum1 and sum2
2272    should be initialized with -zero as well (otherwise we will get +zero
2273    as the final result).  */
2274
2275 static void
2276 insert_var_expansion_initialization (struct var_to_expand *ve,
2277                                      basic_block place)
2278 {
2279   rtx seq, var, zero_init;
2280   unsigned i;
2281   enum machine_mode mode = GET_MODE (ve->reg);
2282   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2283
2284   if (ve->var_expansions.length () == 0)
2285     return;
2286
2287   start_sequence ();
2288   switch (ve->op)
2289     {
2290     case FMA:
2291       /* Note that we only accumulate FMA via the ADD operand.  */
2292     case PLUS:
2293     case MINUS:
2294       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2295         {
2296           if (honor_signed_zero_p)
2297             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2298           else
2299             zero_init = CONST0_RTX (mode);
2300           emit_move_insn (var, zero_init);
2301         }
2302       break;
2303
2304     case MULT:
2305       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2306         {
2307           zero_init = CONST1_RTX (GET_MODE (var));
2308           emit_move_insn (var, zero_init);
2309         }
2310       break;
2311
2312     default:
2313       gcc_unreachable ();
2314     }
2315
2316   seq = get_insns ();
2317   end_sequence ();
2318
2319   emit_insn_after (seq, BB_END (place));
2320 }
2321
2322 /* Combine the variable expansions at the loop exit.  PLACE is the
2323    loop exit basic block where the summation of the expansions should
2324    take place.  */
2325
2326 static void
2327 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2328 {
2329   rtx sum = ve->reg;
2330   rtx expr, seq, var, insn;
2331   unsigned i;
2332
2333   if (ve->var_expansions.length () == 0)
2334     return;
2335
2336   start_sequence ();
2337   switch (ve->op)
2338     {
2339     case FMA:
2340       /* Note that we only accumulate FMA via the ADD operand.  */
2341     case PLUS:
2342     case MINUS:
2343       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2344         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2345       break;
2346
2347     case MULT:
2348       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2349         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2350       break;
2351
2352     default:
2353       gcc_unreachable ();
2354     }
2355
2356   expr = force_operand (sum, ve->reg);
2357   if (expr != ve->reg)
2358     emit_move_insn (ve->reg, expr);
2359   seq = get_insns ();
2360   end_sequence ();
2361
2362   insn = BB_HEAD (place);
2363   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2364     insn = NEXT_INSN (insn);
2365
2366   emit_insn_after (seq, insn);
2367 }
2368
2369 /* Strip away REG_EQUAL notes for IVs we're splitting.
2370
2371    Updating REG_EQUAL notes for IVs we split is tricky: We
2372    cannot tell until after unrolling, DF-rescanning, and liveness
2373    updating, whether an EQ_USE is reached by the split IV while
2374    the IV reg is still live.  See PR55006.
2375
2376    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2377    because RTL loop-iv requires us to defer rescanning insns and
2378    any notes attached to them.  So resort to old techniques...  */
2379
2380 static void
2381 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2382 {
2383   struct iv_to_split *ivts;
2384   rtx note = find_reg_equal_equiv_note (insn);
2385   if (! note)
2386     return;
2387   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2388     if (reg_mentioned_p (ivts->orig_var, note))
2389       {
2390         remove_note (insn, note);
2391         return;
2392       }
2393 }
2394
2395 /* Apply loop optimizations in loop copies using the
2396    data which gathered during the unrolling.  Structure
2397    OPT_INFO record that data.
2398
2399    UNROLLING is true if we unrolled (not peeled) the loop.
2400    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2401    the loop (as it should happen in complete unrolling, but not in ordinary
2402    peeling of the loop).  */
2403
2404 static void
2405 apply_opt_in_copies (struct opt_info *opt_info,
2406                      unsigned n_copies, bool unrolling,
2407                      bool rewrite_original_loop)
2408 {
2409   unsigned i, delta;
2410   basic_block bb, orig_bb;
2411   rtx insn, orig_insn, next;
2412   struct iv_to_split ivts_templ, *ivts;
2413   struct var_to_expand ve_templ, *ves;
2414
2415   /* Sanity check -- we need to put initialization in the original loop
2416      body.  */
2417   gcc_assert (!unrolling || rewrite_original_loop);
2418
2419   /* Allocate the basic variables (i0).  */
2420   if (opt_info->insns_to_split.is_created ())
2421     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2422       allocate_basic_variable (ivts);
2423
2424   for (i = opt_info->first_new_block;
2425        i < (unsigned) last_basic_block_for_fn (cfun);
2426        i++)
2427     {
2428       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2429       orig_bb = get_bb_original (bb);
2430
2431       /* bb->aux holds position in copy sequence initialized by
2432          duplicate_loop_to_header_edge.  */
2433       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2434                                         unrolling);
2435       bb->aux = 0;
2436       orig_insn = BB_HEAD (orig_bb);
2437       FOR_BB_INSNS_SAFE (bb, insn, next)
2438         {
2439           if (!INSN_P (insn)
2440               || (DEBUG_INSN_P (insn)
2441                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2442             continue;
2443
2444           while (!INSN_P (orig_insn)
2445                  || (DEBUG_INSN_P (orig_insn)
2446                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2447                          == LABEL_DECL)))
2448             orig_insn = NEXT_INSN (orig_insn);
2449
2450           ivts_templ.insn = orig_insn;
2451           ve_templ.insn = orig_insn;
2452
2453           /* Apply splitting iv optimization.  */
2454           if (opt_info->insns_to_split.is_created ())
2455             {
2456               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2457
2458               ivts = opt_info->insns_to_split.find (&ivts_templ);
2459
2460               if (ivts)
2461                 {
2462                   gcc_assert (GET_CODE (PATTERN (insn))
2463                               == GET_CODE (PATTERN (orig_insn)));
2464
2465                   if (!delta)
2466                     insert_base_initialization (ivts, insn);
2467                   split_iv (ivts, insn, delta);
2468                 }
2469             }
2470           /* Apply variable expansion optimization.  */
2471           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2472             {
2473               ves = (struct var_to_expand *)
2474                 opt_info->insns_with_var_to_expand.find (&ve_templ);
2475               if (ves)
2476                 {
2477                   gcc_assert (GET_CODE (PATTERN (insn))
2478                               == GET_CODE (PATTERN (orig_insn)));
2479                   expand_var_during_unrolling (ves, insn);
2480                 }
2481             }
2482           orig_insn = NEXT_INSN (orig_insn);
2483         }
2484     }
2485
2486   if (!rewrite_original_loop)
2487     return;
2488
2489   /* Initialize the variable expansions in the loop preheader
2490      and take care of combining them at the loop exit.  */
2491   if (opt_info->insns_with_var_to_expand.is_created ())
2492     {
2493       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2494         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2495       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2496         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2497     }
2498
2499   /* Rewrite also the original loop body.  Find them as originals of the blocks
2500      in the last copied iteration, i.e. those that have
2501      get_bb_copy (get_bb_original (bb)) == bb.  */
2502   for (i = opt_info->first_new_block;
2503        i < (unsigned) last_basic_block_for_fn (cfun);
2504        i++)
2505     {
2506       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2507       orig_bb = get_bb_original (bb);
2508       if (get_bb_copy (orig_bb) != bb)
2509         continue;
2510
2511       delta = determine_split_iv_delta (0, n_copies, unrolling);
2512       for (orig_insn = BB_HEAD (orig_bb);
2513            orig_insn != NEXT_INSN (BB_END (bb));
2514            orig_insn = next)
2515         {
2516           next = NEXT_INSN (orig_insn);
2517
2518           if (!INSN_P (orig_insn))
2519             continue;
2520
2521           ivts_templ.insn = orig_insn;
2522           if (opt_info->insns_to_split.is_created ())
2523             {
2524               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2525
2526               ivts = (struct iv_to_split *)
2527                 opt_info->insns_to_split.find (&ivts_templ);
2528               if (ivts)
2529                 {
2530                   if (!delta)
2531                     insert_base_initialization (ivts, orig_insn);
2532                   split_iv (ivts, orig_insn, delta);
2533                   continue;
2534                 }
2535             }
2536
2537         }
2538     }
2539 }
2540
2541 /* Release OPT_INFO.  */
2542
2543 static void
2544 free_opt_info (struct opt_info *opt_info)
2545 {
2546   if (opt_info->insns_to_split.is_created ())
2547     opt_info->insns_to_split.dispose ();
2548   if (opt_info->insns_with_var_to_expand.is_created ())
2549     {
2550       struct var_to_expand *ves;
2551
2552       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2553         ves->var_expansions.release ();
2554       opt_info->insns_with_var_to_expand.dispose ();
2555     }
2556   free (opt_info);
2557 }