gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "hard-reg-set.h"
  26 #include "obstack.h"
  27 #include "basic-block.h"
  28 #include "cfgloop.h"
  29 #include "params.h"
  30 #include "expr.h"
  31 #include "hash-table.h"
  32 #include "recog.h"
  33 #include "target.h"
  34 #include "dumpfile.h"
  35
  36 /* This pass performs loop unrolling and peeling.  We only perform these
  37    optimizations on innermost loops (with single exception) because
  38    the impact on performance is greatest here, and we want to avoid
  39    unnecessary code size growth.  The gain is caused by greater sequentiality
  40    of code, better code to optimize for further passes and in some cases
  41    by fewer testings of exit conditions.  The main problem is code growth,
  42    that impacts performance negatively due to effect of caches.
  43
  44    What we do:
  45
  46    -- complete peeling of once-rolling loops; this is the above mentioned
  47       exception, as this causes loop to be cancelled completely and
  48       does not cause code growth
  49    -- complete peeling of loops that roll (small) constant times.
  50    -- simple peeling of first iterations of loops that do not roll much
  51       (according to profile feedback)
  52    -- unrolling of loops that roll constant times; this is almost always
  53       win, as we get rid of exit condition tests.
  54    -- unrolling of loops that roll number of times that we can compute
  55       in runtime; we also get rid of exit condition tests here, but there
  56       is the extra expense for calculating the number of iterations
  57    -- simple unrolling of remaining loops; this is performed only if we
  58       are asked to, as the gain is questionable in this case and often
  59       it may even slow down the code
  60    For more detailed descriptions of each of those, see comments at
  61    appropriate function below.
  62
  63    There are a lot of parameters (defined and described in params.def) that
  64    control how much we unroll/peel.
  65
  66    ??? A great problem is that we don't have a good way to determine
  67    how many times we should unroll the loop; the experiments I have made
  68    showed that this choice may affect performance on the order of several %.
  69    */
  70
  71 /* Information about one induction variable to split.  */
  72
  73 struct iv_to_split
  74 {
  75   rtx insn;             /* The insn in which the induction variable occurs.  */
  76   rtx orig_var;         /* The variable (register) for the IV before split.  */
  77   rtx base_var;         /* The variable on which the values in the further
  78                            iterations are based.  */
  79   rtx step;             /* Step of the induction variable.  */
  80   struct iv_to_split *next; /* Next entry in walking order.  */
  81   unsigned n_loc;       /* Number of entries of LOC that are in use.  */
  82   unsigned loc[3];      /* Location where the definition of the induction
  83                            variable occurs in the insn.  For example if
  84                            N_LOC is 2, the expression is located at
  85                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  86 };
  87
  88 /* Information about one accumulator to expand.  */
  89
  90 struct var_to_expand
  91 {
  92   rtx insn;                        /* The insn in which the variable expansion occurs.  */
  93   rtx reg;                         /* The accumulator which is expanded.  */
  94   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  95   struct var_to_expand *next;      /* Next entry in walking order.  */
  96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  97                                       or multiplication.  */
  98   int expansion_count;             /* The number of expansions generated so far.  */
  99   int reuse_expansion;             /* The expansion we intend to reuse to expand
 100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 101                                       the original accumulator.  Else use
 102                                       var_expansions[REUSE_EXPANSION - 1].  */
 103 };
 104
 105 /* Hash table traits for iv_to_split entries; an entry is identified by
 106    the insn it describes.  */
 107
 108 struct iv_split_hasher : typed_free_remove <iv_to_split>
 109 {
 110   typedef iv_to_split value_type;
 111   typedef iv_to_split compare_type;
 112
 113   /* Hash IVTS by the UID of the insn it describes.  */
 114
 115   static inline hashval_t
 116   hash (const value_type *ivts)
 117   {
 118     return static_cast <hashval_t> (INSN_UID (ivts->insn));
 119   }
 120
 121   /* Return true if I1 and I2 describe the same insn.  */
 122
 123   static inline bool
 124   equal (const value_type *i1, const compare_type *i2)
 125   {
 126     const bool same_insn = i1->insn == i2->insn;
 127     return same_insn;
 128   }
 129 };
 130
 131
 132 /* Hash table traits for var_to_expand entries, keyed by their insn.  */
 133
 134 struct var_expand_hasher : typed_free_remove <var_to_expand>
 135 {
 136   typedef var_to_expand value_type;
 137   typedef var_to_expand compare_type;
 138
 139   /* Hash VES by the UID of the insn it describes.  */
 140
 141   static inline hashval_t
 142   hash (const value_type *ves)
 143   {
 144     return static_cast <hashval_t> (INSN_UID (ves->insn));
 145   }
 146
 147   /* Return true if I1 and I2 refer to the same instruction.  */
 148
 149   static inline bool
 150   equal (const value_type *i1, const compare_type *i2)
 151   {
 152     const bool same_insn = i1->insn == i2->insn;
 153     return same_insn;
 154   }
 155 };
 156
 157
 158 /* Information about the optimizations (IV splitting and accumulator
 159    expansion) applied in the unrolled loop.  */
 160
 161 struct opt_info
 162 {
 163   hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
 164                                                   split.  */
 165   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 166   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 167   hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
 168                                         insns with accumulators to expand.  */
 169   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 170   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 171   unsigned first_new_block;        /* The first basic block that was
 172                                       duplicated.  */
 173   basic_block loop_exit;           /* The loop exit basic block.  */
 174   basic_block loop_preheader;      /* The loop preheader basic block.  */
 175 };
 176
 177 static void decide_unrolling_and_peeling (int);
 178 static void peel_loops_completely (int);
 179 static void decide_peel_simple (struct loop *, int);
 180 static void decide_peel_once_rolling (struct loop *, int);
 181 static void decide_peel_completely (struct loop *, int);
 182 static void decide_unroll_stupid (struct loop *, int);
 183 static void decide_unroll_constant_iterations (struct loop *, int);
 184 static void decide_unroll_runtime_iterations (struct loop *, int);
 185 static void peel_loop_simple (struct loop *);
 186 static void peel_loop_completely (struct loop *);
 187 static void unroll_loop_stupid (struct loop *);
 188 static void unroll_loop_constant_iterations (struct loop *);
 189 static void unroll_loop_runtime_iterations (struct loop *);
 190 static struct opt_info *analyze_insns_in_loop (struct loop *);
 191 static void opt_info_start_duplication (struct opt_info *);
 192 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 193 static void free_opt_info (struct opt_info *);
 194 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 195 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 196 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 197 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 198 static void insert_var_expansion_initialization (struct var_to_expand *,
 199                                                  basic_block);
 200 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 201                                              basic_block);
 202 static rtx get_expansion (struct var_to_expand *);
 203
 204 /* Emit a message at LOCUS summarizing the unroll or peel decided for
 205    LOOP.  Nothing is emitted when no transformation was decided or when
 206    dumping (dump file or -fopt-info) is not enabled.  */
 207
 208 static void
 209 report_unroll_peel (struct loop *loop, location_t locus)
 210 {
 211   struct niter_desc *desc;
 212   int niters = 0;
 213   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 214
 215   if (loop->lpt_decision.decision == LPT_NONE)
 216     return;
 217
 218   if (!dump_enabled_p ())
 219     return;
 220
 221   /* In the special case where the loop never iterated, emit
 222      a different message so that we don't report an unroll by 0.
 223      This matches the equivalent message emitted during tree unrolling.  */
 224   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 225       && !loop->lpt_decision.times)
 226     {
 227       dump_printf_loc (report_flags, locus,
 228                        "Turned loop into non-loop; it never loops.\n");
 229       return;
 230     }
 231
 232   desc = get_simple_loop_desc (loop);
 233
 234   if (desc->const_iter)
 235     niters = desc->niter;
 236   else if (loop->header->count)
 237     niters = expected_loop_iterations (loop);
 238
 239   dump_printf_loc (report_flags, locus,
 240                    "%s loop %d times",
 241                    (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 242                     ?  "Completely unroll"
 243                     : (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 244                        ? "Peel" : "Unroll")),
 245                    loop->lpt_decision.times);
 246   /* Print the header count at full width; narrowing it to int would
 247      misreport large profile counts.  */
 248   if (profile_info)
 249     dump_printf (report_flags,
 250                  " (header execution count " HOST_WIDEST_INT_PRINT_DEC,
 251                  (HOST_WIDEST_INT) loop->header->count);
 252   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 253     dump_printf (report_flags, "%s%s iterations %d)",
 254                  profile_info ? ", " : " (",
 255                  desc->const_iter ? "const" : "average", niters);
 256   else if (profile_info)
 257     dump_printf (report_flags, ")");
 258
 259   dump_printf (report_flags, "\n");
 260 }
 261
 262 /* Unroll and/or peel (depending on FLAGS) all loops, innermost first.  */
 263 void
 264 unroll_and_peel_loops (int flags)
 265 {
 266   struct loop *loop;
 267   bool changed = false;
 268   loop_iterator li;
 269
 270   /* First perform complete loop peeling (it is almost surely a win,
 271      and affects parameters for further decision a lot).  */
 272   peel_loops_completely (flags);
 273
 274   /* Now decide the rest; each choice is recorded in LOOP->lpt_decision.  */
 275   decide_unrolling_and_peeling (flags);
 276
 277   /* Scan the loops, inner ones first.  */
 278   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 279     {
 280       /* And perform the transformation selected for this loop.  */
 281       switch (loop->lpt_decision.decision)
 282         {
 283         case LPT_PEEL_COMPLETELY:
 284           /* Complete peeling was already done by peel_loops_completely.  */
 285           gcc_unreachable ();
 286         case LPT_PEEL_SIMPLE:
 287           peel_loop_simple (loop);
 288           changed = true;
 289           break;
 290         case LPT_UNROLL_CONSTANT:
 291           unroll_loop_constant_iterations (loop);
 292           changed = true;
 293           break;
 294         case LPT_UNROLL_RUNTIME:
 295           unroll_loop_runtime_iterations (loop);
 296           changed = true;
 297           break;
 298         case LPT_UNROLL_STUPID:
 299           unroll_loop_stupid (loop);
 300           changed = true;
 301           break;
 302         case LPT_NONE:
 303           break;
 304         default:
 305           gcc_unreachable ();
 306         }
 307     }
 308
 309     if (changed)
 310       {
 311         calculate_dominance_info (CDI_DOMINATORS);
 312         fix_loop_structure (NULL);
 313       }
 314
 315   iv_analysis_done ();
 316 }
 317
 318 /* Return true if the exit of LOOP is at the end of the loop body: the
 319    in_edge of its niter description enters an effectively empty latch.  */
 320
 321 static bool
 322 loop_exit_at_end_p (struct loop *loop)
 323 {
 324   struct niter_desc *niter_info = get_simple_loop_desc (loop);
 325   basic_block latch = loop->latch;
 326   rtx cur;
 327
 328   if (niter_info->in_edge->dest != latch)
 329     return false;
 330
 331   /* Debug insns are ignored when checking that the latch is empty.  */
 332   FOR_BB_INSNS (latch, cur)
 333     if (NONDEBUG_INSN_P (cur))
 334       return false;
 335
 336   return true;
 337 }
 338
 339 /* Depending on FLAGS, decide which loops to peel completely and do so.  */
 340 static void
 341 peel_loops_completely (int flags)
 342 {
 343   struct loop *loop;
 344   loop_iterator li;
 345   bool changed = false;
 346
 347   /* Scan the loops, the inner ones first; every loop starts undecided.  */
 348   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 349     {
 350       loop->lpt_decision.decision = LPT_NONE;
 351       location_t locus = get_loop_location (loop);
 352
 353       if (dump_enabled_p ())
 354         dump_printf_loc (TDF_RTL, locus,
 355                          ";; *** Considering loop %d at BB %d for "
 356                          "complete peeling ***\n",
 357                          loop->num, loop->header->index);
 358
 359       loop->ninsns = num_loop_insns (loop); /* Read by both deciders.  */
 360
 361       decide_peel_once_rolling (loop, flags);
 362       if (loop->lpt_decision.decision == LPT_NONE)
 363         decide_peel_completely (loop, flags);
 364
 365       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 366         {
 367           report_unroll_peel (loop, locus);
 368           peel_loop_completely (loop);
 369           changed = true;
 370         }
 371     }
 372
 373     if (changed)
 374       {
 375         calculate_dominance_info (CDI_DOMINATORS);
 376         fix_loop_structure (NULL);
 377       }
 378 }
 379
 380 /* Decide, per FLAGS, how to unroll or peel each loop; set its lpt_decision.  */
 381 static void
 382 decide_unrolling_and_peeling (int flags)
 383 {
 384   struct loop *loop;
 385   loop_iterator li;
 386
 387   /* Scan the loops, inner ones first.  */
 388   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 389     {
 390       loop->lpt_decision.decision = LPT_NONE;
 391       location_t locus = get_loop_location (loop);
 392
 393       if (dump_enabled_p ())
 394         dump_printf_loc (TDF_RTL, locus,
 395                          ";; *** Considering loop %d at BB %d for "
 396                          "unrolling and peeling ***\n",
 397                          loop->num, loop->header->index);
 398
 399       /* Do not unroll or peel loops optimized for size (cold areas).  */
 400       if (optimize_loop_for_size_p (loop))
 401         {
 402           if (dump_file)
 403             fprintf (dump_file, ";; Not considering loop, cold area\n");
 404           continue;
 405         }
 406
 407       /* Can the loop be manipulated?  */
 408       if (!can_duplicate_loop_p (loop))
 409         {
 410           if (dump_file)
 411             fprintf (dump_file,
 412                      ";; Not considering loop, cannot duplicate\n");
 413           continue;
 414         }
 415
 416       /* Skip non-innermost loops.  */
 417       if (loop->inner)
 418         {
 419           if (dump_file)
 420             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 421           continue;
 422         }
 423
 424       loop->ninsns = num_loop_insns (loop);
 425       loop->av_ninsns = average_num_loop_insns (loop);
 426
 427       /* Try transformations one by one in decreasing order of
 428          priority; the first one that sets a decision wins.  */
 429
 430       decide_unroll_constant_iterations (loop, flags);
 431       if (loop->lpt_decision.decision == LPT_NONE)
 432         decide_unroll_runtime_iterations (loop, flags);
 433       if (loop->lpt_decision.decision == LPT_NONE)
 434         decide_unroll_stupid (loop, flags);
 435       if (loop->lpt_decision.decision == LPT_NONE)
 436         decide_peel_simple (loop, flags);
 437
 438       report_unroll_peel (loop, locus);
 439     }
 440 }
 441
 442 /* Mark LOOP for complete peeling if it is small enough and can be
 443    proven to roll exactly once.  FLAGS is not used.  */
 444 static void
 445 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 446 {
 447   if (dump_file)
 448     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 449
 450   /* Give up on loops above the once-peeled size limit.  */
 451   if (loop->ninsns > (unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS))
 452     {
 453       if (dump_file)
 454         fprintf (dump_file, ";; Not considering loop, is too big\n");
 455       return;
 456     }
 457
 458   struct niter_desc *niter_info = get_simple_loop_desc (loop);
 459
 460   /* The description must be simple, free of assumptions, not flagged
 461      infinite and have a constant iteration count; on top of that either
 462      the count or max_loop_iterations_int (LOOP) must be zero.  */
 463   bool rolls_once = (niter_info->simple_p
 464                      && !niter_info->assumptions
 465                      && !niter_info->infinite
 466                      && niter_info->const_iter
 467                      && (niter_info->niter == 0
 468                          || max_loop_iterations_int (loop) == 0));
 469
 470   if (!rolls_once)
 471     {
 472       if (dump_file)
 473         fprintf (dump_file,
 474                  ";; Unable to prove that the loop rolls exactly once\n");
 475       return;
 476     }
 477
 478   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 479 }
 480
 481 /* Decide whether LOOP is suitable for complete peeling; FLAGS is unused.  */
 482 static void
 483 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 484 {
 485   unsigned npeel;
 486   struct niter_desc *desc;
 487
 488   if (dump_file)
 489     fprintf (dump_file, "\n;; Considering peeling completely\n");
 490
 491   /* Skip non-innermost loops.  */
 492   if (loop->inner)
 493     {
 494       if (dump_file)
 495         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 496       return;
 497     }
 498
 499   /* Do not peel cold areas.  */
 500   if (optimize_loop_for_size_p (loop))
 501     {
 502       if (dump_file)
 503         fprintf (dump_file, ";; Not considering loop, cold area\n");
 504       return;
 505     }
 506
 507   /* Can the loop be manipulated?  */
 508   if (!can_duplicate_loop_p (loop))
 509     {
 510       if (dump_file)
 511         fprintf (dump_file,
 512                  ";; Not considering loop, cannot duplicate\n");
 513       return;
 514     }
 515
 516   /* npeel = maximum number of iterations we are willing to peel.  */
 517   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 518   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 519     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 520
 521   /* Is the loop small enough?  */
 522   if (!npeel)
 523     {
 524       if (dump_file)
 525         fprintf (dump_file, ";; Not considering loop, is too big\n");
 526       return;
 527     }
 528
 529   /* Check for simple loops.  */
 530   desc = get_simple_loop_desc (loop);
 531
 532   /* Check number of iterations.  */
 533   if (!desc->simple_p
 534       || desc->assumptions
 535       || !desc->const_iter
 536       || desc->infinite)
 537     {
 538       if (dump_file)
 539         fprintf (dump_file,
 540                  ";; Unable to prove that the loop iterates constant times\n");
 541       return;
 542     }
 543
 544   if (desc->niter > npeel - 1) /* NPEEL is nonzero here, so no wraparound.  */
 545     {
 546       if (dump_file)
 547         {
 548           fprintf (dump_file,
 549                    ";; Not peeling loop completely, rolls too much (");
 550           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 551           fprintf (dump_file, " iterations > %u [maximum peelings])\n", npeel);
 552         }
 553       return;
 554     }
 555
 556   /* Success.  */
 557   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 558 }
 559
 560 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 561    completely (also freeing its niter description).  The transformation:
 562
 563    for (i = 0; i < 4; i++)
 564      body;
 565
 566    ==>
 567
 568    i = 0;
 569    body; i++;
 570    body; i++;
 571    body; i++;
 572    body; i++;
 573    */
 574 static void
 575 peel_loop_completely (struct loop *loop)
 576 {
 577   sbitmap wont_exit;
 578   unsigned HOST_WIDE_INT npeel;
 579   unsigned i;
 580   vec<edge> remove_edges; /* Filled by duplication, removed afterwards.  */
 581   edge ein;
 582   struct niter_desc *desc = get_simple_loop_desc (loop);
 583   struct opt_info *opt_info = NULL;
 584
 585   npeel = desc->niter;
 586
 587   if (npeel)
 588     {
 589       bool ok;
 590
 591       wont_exit = sbitmap_alloc (npeel + 1);
 592       bitmap_ones (wont_exit);
 593       bitmap_clear_bit (wont_exit, 0);
 594       if (desc->noloop_assumptions)
 595         bitmap_clear_bit (wont_exit, 1);
 596
 597       remove_edges.create (0);
 598
 599       if (flag_split_ivs_in_unroller)
 600         opt_info = analyze_insns_in_loop (loop);
 601
 602       opt_info_start_duplication (opt_info);
 603       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 604                                           npeel,
 605                                           wont_exit, desc->out_edge,
 606                                           &remove_edges,
 607                                           DLTHE_FLAG_UPDATE_FREQ
 608                                           | DLTHE_FLAG_COMPLETTE_PEEL
 609                                           | (opt_info
 610                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 611       gcc_assert (ok);
 612
 613       free (wont_exit);
 614
 615       if (opt_info)
 616         {
 617           apply_opt_in_copies (opt_info, npeel, false, true);
 618           free_opt_info (opt_info);
 619         }
 620
 621       /* Remove the exit edges collected during the duplication.  */
 622       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 623         remove_path (ein);
 624       remove_edges.release ();
 625     }
 626
 627   ein = desc->in_edge; /* Fetch before DESC is freed just below.  */
 628   free_simple_loop_desc (loop);
 629
 630   /* Now remove the unreachable part of the last iteration and cancel
 631      the loop.  */
 632   remove_path (ein);
 633
 634   if (dump_file)
 635     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 636 }
 637
 638 /* Decide whether to unroll LOOP iterating constant number of times and
 639    how much; on success LOOP->lpt_decision records LPT_UNROLL_CONSTANT.  */
 640
 641 static void
 642 decide_unroll_constant_iterations (struct loop *loop, int flags)
 643 {
 644   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 645   struct niter_desc *desc;
 646   double_int iterations;
 647
 648   if (!(flags & UAP_UNROLL))
 649     {
 650       /* We were not asked to, just return back silently.  */
 651       return;
 652     }
 653
 654   if (dump_file)
 655     fprintf (dump_file,
 656              "\n;; Considering unrolling loop with constant "
 657              "number of iterations\n");
 658
 659   /* nunroll = total number of copies of the original loop body in
 660      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
 661   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 662   nunroll_by_av
 663     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 664   if (nunroll > nunroll_by_av)
 665     nunroll = nunroll_by_av;
 666   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 667     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 668
 669   /* Skip big loops.  */
 670   if (nunroll <= 1)
 671     {
 672       if (dump_file)
 673         fprintf (dump_file, ";; Not considering loop, is too big\n");
 674       return;
 675     }
 676
 677   /* Check for simple loops.  */
 678   desc = get_simple_loop_desc (loop);
 679
 680   /* Check number of iterations.  */
 681   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 682     {
 683       if (dump_file)
 684         fprintf (dump_file,
 685                  ";; Unable to prove that the loop iterates constant times\n");
 686       return;
 687     }
 688
 689   /* Check whether the loop rolls enough to consider.
 690      Consult also loop bounds and profile; in the case the loop has more
 691      than one exit it may well loop less than determined maximal number
 692      of iterations.  */
 693   if (desc->niter < 2 * nunroll
 694       || ((estimated_loop_iterations (loop, &iterations)
 695            || max_loop_iterations (loop, &iterations))
 696           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 697     {
 698       if (dump_file)
 699         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 700       return;
 701     }
 702
 703   /* Success; now compute number of iterations to unroll.  We alter
 704      nunroll so that as few as possible copies of loop body are
 705      necessary, while still not decreasing the number of unrollings
 706      too much (at most by 1).  */
 707   best_copies = 2 * nunroll + 10;
 708
 709   i = 2 * nunroll + 2;
 710   if (i - 1 >= desc->niter)
 711     i = desc->niter - 2;
 712
 713   for (; i >= nunroll - 1; i--) /* NUNROLL >= 2 here, so the bound is >= 1.  */
 714     {
 715       unsigned exit_mod = desc->niter % (i + 1);
 716
 717       if (!loop_exit_at_end_p (loop))
 718         n_copies = exit_mod + i + 1;
 719       else if (exit_mod != (unsigned) i
 720                || desc->noloop_assumptions != NULL_RTX)
 721         n_copies = exit_mod + i + 2;
 722       else
 723         n_copies = i + 1;
 724
 725       if (n_copies < best_copies)
 726         {
 727           best_copies = n_copies;
 728           best_unroll = i;
 729         }
 730     }
 731
 732   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 733   loop->lpt_decision.times = best_unroll;
 734 }
 735
 736 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 737    The transformation does this:
 738
 739    for (i = 0; i < 102; i++)
 740      body;
 741
 742    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 743
 744    i = 0;
 745    body; i++;
 746    body; i++;
 747    while (i < 102)
 748      {
 749        body; i++;
 750        body; i++;
 751        body; i++;
 752        body; i++;
 753      }
 754   */
 755 static void
 756 unroll_loop_constant_iterations (struct loop *loop)
 757 {
 758   unsigned HOST_WIDE_INT niter;
 759   unsigned exit_mod;
 760   sbitmap wont_exit;
 761   unsigned i;
 762   vec<edge> remove_edges;
 763   edge e;
 764   unsigned max_unroll = loop->lpt_decision.times;
 765   struct niter_desc *desc = get_simple_loop_desc (loop);
 766   bool exit_at_end = loop_exit_at_end_p (loop);
 767   struct opt_info *opt_info = NULL;
 768   bool ok;
 769
 770   niter = desc->niter;
 771
 772   /* Should not get here (such loop should be peeled instead).  */
 773   gcc_assert (niter > max_unroll + 1);
 774
 775   exit_mod = niter % (max_unroll + 1);
 776
 777   wont_exit = sbitmap_alloc (max_unroll + 1);
 778   bitmap_ones (wont_exit);
 779
 780   remove_edges.create (0);
 781   if (flag_split_ivs_in_unroller
 782       || flag_variable_expansion_in_unroller)
 783     opt_info = analyze_insns_in_loop (loop);
 784
 785   if (!exit_at_end)
 786     {
 787       /* The exit is not at the end of the loop; leave exit test
 788          in the first copy, so that the loops that start with test
 789          of exit condition have continuous body after unrolling.  */
 790
 791       if (dump_file)
 792         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 793
 794       /* Peel exit_mod iterations.  */
 795       bitmap_clear_bit (wont_exit, 0);
 796       if (desc->noloop_assumptions)
 797         bitmap_clear_bit (wont_exit, 1);
 798
 799       if (exit_mod)
 800         {
 801           opt_info_start_duplication (opt_info);
 802           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 803                                               exit_mod,
 804                                               wont_exit, desc->out_edge,
 805                                               &remove_edges,
 806                                               DLTHE_FLAG_UPDATE_FREQ
 807                                               | (opt_info && exit_mod > 1
 808                                                  ? DLTHE_RECORD_COPY_NUMBER
 809                                                    : 0));
 810           gcc_assert (ok);
 811
 812           if (opt_info && exit_mod > 1)
 813             apply_opt_in_copies (opt_info, exit_mod, false, false);
 814
 815           desc->noloop_assumptions = NULL_RTX;
 816           desc->niter -= exit_mod;
 817           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 818           if (loop->any_estimate
 819               && double_int::from_uhwi (exit_mod).ule
 820                    (loop->nb_iterations_estimate))
 821             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 822           else
 823             loop->any_estimate = false;
 824         }
 825
 826       bitmap_set_bit (wont_exit, 1);
 827     }
 828   else
 829     {
 830       /* Leave exit test in last copy, for the same reason as above if
 831          the loop tests the condition at the end of loop body.  */
 832
 833       if (dump_file)
 834         fprintf (dump_file, ";; Condition at end of loop.\n");
 835
 836       /* We know that niter >= max_unroll + 2; so we do not need to care of
 837          case when we would exit before reaching the loop.  So just peel
 838          exit_mod + 1 iterations.  */
 839       if (exit_mod != max_unroll
 840           || desc->noloop_assumptions)
 841         {
 842           bitmap_clear_bit (wont_exit, 0);
 843           if (desc->noloop_assumptions)
 844             bitmap_clear_bit (wont_exit, 1);
 845
 846           opt_info_start_duplication (opt_info);
 847           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 848                                               exit_mod + 1,
 849                                               wont_exit, desc->out_edge,
 850                                               &remove_edges,
 851                                               DLTHE_FLAG_UPDATE_FREQ
 852                                               | (opt_info && exit_mod > 0
 853                                                  ? DLTHE_RECORD_COPY_NUMBER
 854                                                    : 0));
 855           gcc_assert (ok);
 856
 857           if (opt_info && exit_mod > 0)
 858             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 859
 860           desc->niter -= exit_mod + 1;
 861           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 862           if (loop->any_estimate
 863               && double_int::from_uhwi (exit_mod + 1).ule
 864                    (loop->nb_iterations_estimate))
 865             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 866           else
 867             loop->any_estimate = false;
 868           desc->noloop_assumptions = NULL_RTX;
 869
 870           bitmap_set_bit (wont_exit, 0);
 871           bitmap_set_bit (wont_exit, 1);
 872         }
 873
 874       bitmap_clear_bit (wont_exit, max_unroll);
 875     }
 876
 877   /* Now unroll the loop.  */
 878
 879   opt_info_start_duplication (opt_info);
 880   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 881                                       max_unroll,
 882                                       wont_exit, desc->out_edge,
 883                                       &remove_edges,
 884                                       DLTHE_FLAG_UPDATE_FREQ
 885                                       | (opt_info
 886                                          ? DLTHE_RECORD_COPY_NUMBER
 887                                            : 0));
 888   gcc_assert (ok);
 889
 890   if (opt_info)
 891     {
 892       apply_opt_in_copies (opt_info, max_unroll, true, true);
 893       free_opt_info (opt_info);
 894     }
 895
 896   free (wont_exit);
 897
 898   if (exit_at_end)
 899     {
 900       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 901       /* Find a new in and out edge; they are in the last copy we have made.  */
 902
 903       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 904         {
 905           desc->out_edge = EDGE_SUCC (exit_block, 0);
 906           desc->in_edge = EDGE_SUCC (exit_block, 1);
 907         }
 908       else
 909         {
 910           desc->out_edge = EDGE_SUCC (exit_block, 1);
 911           desc->in_edge = EDGE_SUCC (exit_block, 0);
 912         }
 913     }
 914
 915   desc->niter /= max_unroll + 1;
 916   loop->nb_iterations_upper_bound
 917     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 918                                                                    + 1),
 919                                             TRUNC_DIV_EXPR);
 920   if (loop->any_estimate)
 921     loop->nb_iterations_estimate
 922       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 923                                                                   + 1),
 924                                            TRUNC_DIV_EXPR);
 925   desc->niter_expr = GEN_INT (desc->niter);
 926
 927   /* Remove the edges.  */
 928   FOR_EACH_VEC_ELT (remove_edges, i, e)
 929     remove_path (e);
 930   remove_edges.release ();
 931
 932   if (dump_file)
 933     fprintf (dump_file,
 934              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 935              max_unroll, num_loop_insns (loop));
 936 }
 937
 938 /* Decide whether LOOP, whose iteration count is only computable at run
 939    time, should be unrolled, and by what factor.  Does nothing unless FLAGS
 940    has UAP_UNROLL.  On success records LPT_UNROLL_RUNTIME and the number of
 941    extra body copies (a power of two minus one) in LOOP->lpt_decision.  */
 942 static void
 943 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 944 {
 945   unsigned copies, copies_by_avg, pow2 = 1;
 946   struct niter_desc *niter_info;
 947   double_int niter_bound;
 948
 949   /* Unrolling was not requested; leave silently.  */
 950   if ((flags & UAP_UNROLL) == 0)
 951     return;
 952
 953   if (dump_file)
 954     fprintf (dump_file,
 955              "\n;; Considering unrolling loop with runtime "
 956              "computable number of iterations\n");
 957
 958   /* COPIES is the total number of loop bodies in the unrolled loop, the
 959      original one included (so 2 means the body is duplicated once).  */
 960   copies = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 961   copies_by_avg = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 962   if (copies_by_avg < copies)
 963     copies = copies_by_avg;
 964   if ((unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES) < copies)
 965     copies = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 966
 967   /* Give the target hook, when there is one, a chance to adjust.  */
 968   if (targetm.loop_unroll_adjust)
 969     copies = targetm.loop_unroll_adjust (copies, loop);
 970
 971   /* Fewer than two copies means the loop is too big to unroll.  */
 972   if (copies <= 1)
 973     {
 974       if (dump_file)
 975         fprintf (dump_file, ";; Not considering loop, is too big\n");
 976       return;
 977     }
 978
 979   /* Only simple loops with no outstanding assumptions qualify.  */
 980   niter_info = get_simple_loop_desc (loop);
 981   if (!(niter_info->simple_p && !niter_info->assumptions))
 982     {
 983       if (dump_file)
 984         fprintf (dump_file,
 985                  ";; Unable to prove that the number of iterations "
 986                  "can be counted in runtime\n");
 987       return;
 988     }
 989
 990   /* Constant iteration counts are not handled by this decision.  */
 991   if (niter_info->const_iter)
 992     {
 993       if (dump_file)
 994         fprintf (dump_file, ";; Loop iterates constant times\n");
 995       return;
 996     }
 997
 998   /* Check whether the loop rolls enough (estimate first, then maximum).  */
 999   if ((estimated_loop_iterations (loop, &niter_bound)
1000        || max_loop_iterations (loop, &niter_bound))
1001       && niter_bound.ult (double_int::from_shwi (2 * copies)))
1002     {
1003       if (dump_file)
1004         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1005       return;
1006     }
1007
1008   /* Success; round COPIES down to a power of two, as we are unable to
1009      cope with overflows in computation of number of iterations.  */
1010   while (2 * pow2 <= copies)
1011     pow2 *= 2;
1012
1013   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1014   loop->lpt_decision.times = pow2 - 1;
1015 }
1016
1017 /* Split edge E and emit the insn sequence INSNS at the end of the block
1018    created on it; return that new block.  When INSNS is NULL_RTX the CFG
1019    is left alone and NULL is returned.  */
1020
1021 basic_block
1022 split_edge_and_insert (edge e, rtx insns)
1023 {
1024   basic_block new_bb;
1025
1026   /* Nothing to insert means nothing to split.  */
1027   if (insns == NULL_RTX)
1028     return NULL;
1029
1030   new_bb = split_edge (e);
1031   emit_insn_after (insns, BB_END (new_bb));
1032
1033   /* ??? INSNS is assumed to contain no control flow insns.  We used to
1034      allow for them: the new block was flagged BB_SUPERBLOCK and
1035      break_superblocks was called when leaving cfglayout mode, so that sub
1036      basic blocks would be found and the CFG kept valid.  It turns out this
1037      never happens, and if it ever did, the TODO_verify_flow at the end of
1038      the RTL loop passes would fail.
1039
1040      Control flow in INSNS was expected for two reasons: a comparison
1041      that has to be done in parts, and the computation of the number of
1042      iterations for loops whose iteration count is known only at runtime.
1043      For both, test cases producing control flow in INSNS appear to be
1044      impossible to construct:
1045
1046       * If do_compare_rtx_and_jump needs several branches to compare in a
1047         mode that requires comparison by parts, we cannot analyze the
1048         number of iterations of the loop, and never get to unrolling it.
1049
1050       * The code in expand_divmod suspected of creating branching code
1051         seems to be reached only for signed division, while the
1052         divisions used by # of iterations analysis are always unsigned.
1053         Architectures that emit branching code for some operations that
1054         may appear in the unroller (especially division) could be a
1055         problem, but we have no such architectures.
1056
1057      So for now INSNS may in theory contain control flow insns but in
1058      practice never does.  The theoretical case is not handled; should a
1059      real failure ever show up, we have a pretty good clue how to fix it.  */
1060
1061   return new_bb;
1062 }
1063
1064 /* Unroll LOOP for which we are able to count number of iterations in runtime
1065    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1066    extra care for case n < 0):
1067
1068    for (i = 0; i < n; i++)
1069      body;
1070
1071    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1072
1073    i = 0;
1074    mod = n % 4;
1075
1076    switch (mod)
1077      {
1078        case 3:
1079          body; i++;
1080        case 2:
1081          body; i++;
1082        case 1:
1083          body; i++;
1084        case 0: ;
1085      }
1086
1087    while (i < n)
1088      {
1089        body; i++;
1090        body; i++;
1091        body; i++;
1092        body; i++;
1093      }

        Notes on the implementation below: the modulo is computed by ANDing
        the iteration count with LOOP->LPT_DECISION.TIMES, which relies on
        TIMES + 1 being a power of two.  When the exit test is at the end of
        the loop body, N_PEEL below is TIMES rather than TIMES - 1 and no
        separate check for zero iterations is emitted.  On return the simple
        loop description of LOOP (niter_expr, in/out edges) and the recorded
        iteration bounds are updated for the unrolled loop.
1094    */
1095 static void
1096 unroll_loop_runtime_iterations (struct loop *loop)
1097 {
1098   rtx old_niter, niter, init_code, branch_code, tmp;
1099   unsigned i, j, p;
1100   basic_block preheader, *body, swtch, ezc_swtch;
1101   vec<basic_block> dom_bbs;
1102   sbitmap wont_exit;
1103   int may_exit_copy;
1104   unsigned n_peel;
1105   vec<edge> remove_edges;
1106   edge e;
1107   bool extra_zero_check, last_may_exit;
1108   unsigned max_unroll = loop->lpt_decision.times;
1109   struct niter_desc *desc = get_simple_loop_desc (loop);
1110   bool exit_at_end = loop_exit_at_end_p (loop);
1111   struct opt_info *opt_info = NULL;
1112   bool ok;
1113
       /* Analyze the loop body (for apply_opt_in_copies at the end) only when
          one of the unroller optimizations is enabled; OPT_INFO stays NULL
          otherwise.  */
1114   if (flag_split_ivs_in_unroller
1115       || flag_variable_expansion_in_unroller)
1116     opt_info = analyze_insns_in_loop (loop);
1117
1118   /* Remember blocks whose dominators will have to be updated.  */
1119   dom_bbs.create (0);
1120
       /* These are the blocks outside LOOP that are dominated by some block
          of LOOP.  */
1121   body = get_loop_body (loop);
1122   for (i = 0; i < loop->num_nodes; i++)
1123     {
1124       vec<basic_block> ldom;
1125       basic_block bb;
1126
1127       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1128       FOR_EACH_VEC_ELT (ldom, j, bb)
1129         if (!flow_bb_inside_loop_p (loop, bb))
1130           dom_bbs.safe_push (bb);
1131
1132       ldom.release ();
1133     }
1134   free (body);
1135
       /* Choose the peeling layout:
            MAY_EXIT_COPY    - the bit of WONT_EXIT cleared for the final
                               unrolling;
            N_PEEL           - number of copies peeled by the switch-building
                               loop below (one more copy is peeled before it);
            EXTRA_ZERO_CHECK - whether a separate branch for a zero masked
                               iteration count is emitted;
            LAST_MAY_EXIT    - whether bit 1 of WONT_EXIT is left clear for
                               the last copy peeled by that loop.  */
1136   if (!exit_at_end)
1137     {
1138       /* Leave exit in first copy (for explanation why see comment in
1139          unroll_loop_constant_iterations).  */
1140       may_exit_copy = 0;
1141       n_peel = max_unroll - 1;
1142       extra_zero_check = true;
1143       last_may_exit = false;
1144     }
1145   else
1146     {
1147       /* Leave exit in last copy (for explanation why see comment in
1148          unroll_loop_constant_iterations).  */
1149       may_exit_copy = max_unroll;
1150       n_peel = max_unroll;
1151       extra_zero_check = false;
1152       last_may_exit = true;
1153     }
1154
1155   /* Get expression for number of iterations.  The insns computing it are
            collected in a sequence; OLD_NITER keeps the pseudo holding the
            unmasked count, which is reused for the new niter_expr below.  */
1156   start_sequence ();
1157   old_niter = niter = gen_reg_rtx (desc->mode);
1158   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1159   if (tmp != niter)
1160     emit_move_insn (niter, tmp);
1161
1162   /* Count modulo by ANDing it with max_unroll; we use the fact that
1163      the number of unrollings is a power of two, and thus this is correct
1164      even if there is overflow in the computation.  */
1165   niter = expand_simple_binop (desc->mode, AND,
1166                                niter,
1167                                GEN_INT (max_unroll),
1168                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1169
1170   init_code = get_insns ();
1171   end_sequence ();
1172   unshare_all_rtl_in_chain (init_code);
1173
1174   /* Precondition the loop.  */
1175   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1176
1177   remove_edges.create (0);
1178
1179   wont_exit = sbitmap_alloc (max_unroll + 2);
1180
1181   /* Peel the first copy of loop body (almost always we must leave exit test
1182      here; the only exception is when we have extra zero check and the number
1183      of iterations is reliable).  Also record the place of (possible) extra
1184      zero check.  */
1185   bitmap_clear (wont_exit);
1186   if (extra_zero_check
1187       && !desc->noloop_assumptions)
1188     bitmap_set_bit (wont_exit, 1);
       /* The zero check, if any, is later inserted on the successor edge of
          the block that is in front of the loop at this point.  */
1189   ezc_swtch = loop_preheader_edge (loop)->src;
1190   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1191                                       1, wont_exit, desc->out_edge,
1192                                       &remove_edges,
1193                                       DLTHE_FLAG_UPDATE_FREQ);
1194   gcc_assert (ok);
1195
1196   /* Record the place where switch will be built for preconditioning.  */
1197   swtch = split_edge (loop_preheader_edge (loop));
1198
1199   for (i = 0; i < n_peel; i++)
1200     {
1201       /* Peel the copy.  */
1202       bitmap_clear (wont_exit);
1203       if (i != n_peel - 1 || !last_may_exit)
1204         bitmap_set_bit (wont_exit, 1);
1205       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1206                                           1, wont_exit, desc->out_edge,
1207                                           &remove_edges,
1208                                           DLTHE_FLAG_UPDATE_FREQ);
1209       gcc_assert (ok);
1210
1211       /* Create item for switch.  J is the value of the masked iteration
                count for which control jumps to the copy just peeled; P is
                the probability assigned to that jump.  */
1212       j = n_peel - i - (extra_zero_check ? 0 : 1);
1213       p = REG_BR_PROB_BASE / (i + 2);
1214
1215       preheader = split_edge (loop_preheader_edge (loop));
1216       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1217                                           block_label (preheader), p,
1218                                           NULL_RTX);
1219
1220       /* We rely on the fact that the compare and jump cannot be optimized out,
1221          and hence the cfg we create is correct.  */
1222       gcc_assert (branch_code != NULL_RTX);
1223
1224       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1225       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1226       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1227       e = make_edge (swtch, preheader,
1228                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
           /* NOTE(review): this scales PREHEADER->count by
              REG_BR_PROB_BASE / P, i.e. up rather than down by the branch
              probability -- confirm that this is the intended edge count.  */
1229       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1230       e->probability = p;
1231     }
1232
1233   if (extra_zero_check)
1234     {
1235       /* Add branch for zero iterations.  */
1236       p = REG_BR_PROB_BASE / (max_unroll + 1);
1237       swtch = ezc_swtch;
1238       preheader = split_edge (loop_preheader_edge (loop));
1239       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1240                                           block_label (preheader), p,
1241                                           NULL_RTX);
1242       gcc_assert (branch_code != NULL_RTX);
1243
1244       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1245       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1246       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1247       e = make_edge (swtch, preheader,
1248                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
           /* NOTE(review): same count scaling as in the peeling loop above --
              confirm.  */
1249       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1250       e->probability = p;
1251     }
1252
1253   /* Recount dominators for outer blocks.  */
1254   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1255
1256   /* And unroll loop.  */
1257
       /* Every bit of WONT_EXIT is set except the one for MAY_EXIT_COPY.  */
1258   bitmap_ones (wont_exit);
1259   bitmap_clear_bit (wont_exit, may_exit_copy);
1260   opt_info_start_duplication (opt_info);
1261
1262   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1263                                       max_unroll,
1264                                       wont_exit, desc->out_edge,
1265                                       &remove_edges,
1266                                       DLTHE_FLAG_UPDATE_FREQ
1267                                       | (opt_info
1268                                          ? DLTHE_RECORD_COPY_NUMBER
1269                                            : 0));
1270   gcc_assert (ok);
1271
1272   if (opt_info)
1273     {
1274       apply_opt_in_copies (opt_info, max_unroll, true, true);
1275       free_opt_info (opt_info);
1276     }
1277
1278   free (wont_exit);
1279
1280   if (exit_at_end)
1281     {
1282       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1283       /* Find a new in and out edge; they are in the last copy we have
1284          made.  */
1285
1286       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1287         {
1288           desc->out_edge = EDGE_SUCC (exit_block, 0);
1289           desc->in_edge = EDGE_SUCC (exit_block, 1);
1290         }
1291       else
1292         {
1293           desc->out_edge = EDGE_SUCC (exit_block, 1);
1294           desc->in_edge = EDGE_SUCC (exit_block, 0);
1295         }
1296     }
1297
1298   /* Remove the edges.  */
1299   FOR_EACH_VEC_ELT (remove_edges, i, e)
1300     remove_path (e);
1301   remove_edges.release ();
1302
1303   /* We must be careful when updating the number of iterations due to
1304      preconditioning and the fact that the value must be valid at entry
1305      of the loop.  After passing through the above code, we see that
1306      the correct new number of iterations is this:  */
1307   gcc_assert (!desc->const_iter);
1308   desc->niter_expr =
1309     simplify_gen_binary (UDIV, desc->mode, old_niter,
1310                          GEN_INT (max_unroll + 1));
1311   loop->nb_iterations_upper_bound
1312     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1313                                                                    + 1),
1314                                             TRUNC_DIV_EXPR);
1315   if (loop->any_estimate)
1316     loop->nb_iterations_estimate
1317       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1318                                                                   + 1),
1319                                            TRUNC_DIV_EXPR);
       /* With the exit test at the end, the expression and the recorded
          bounds are additionally decremented by one; an estimate that is
          already zero is dropped instead of being decremented.  */
1320   if (exit_at_end)
1321     {
1322       desc->niter_expr =
1323         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1324       desc->noloop_assumptions = NULL_RTX;
1325       --loop->nb_iterations_upper_bound;
1326       if (loop->any_estimate
1327           && loop->nb_iterations_estimate != double_int_zero)
1328         --loop->nb_iterations_estimate;
1329       else
1330         loop->any_estimate = false;
1331     }
1332
1333   if (dump_file)
1334     fprintf (dump_file,
1335              ";; Unrolled loop %d times, counting # of iterations "
1336              "in runtime, %i insns\n",
1337              max_unroll, num_loop_insns (loop));
1338
1339   dom_bbs.release ();
1340 }
1341
1342 /* Decide whether to simply peel LOOP, and how many iterations.  Does
1343    nothing unless FLAGS has UAP_PEEL.  On success records LPT_PEEL_SIMPLE
1344    and the number of iterations to peel in LOOP->lpt_decision.  */
1345 static void
1346 decide_peel_simple (struct loop *loop, int flags)
1347 {
1348   unsigned peel_count;
1349   double_int niter;
1350
1351   /* Peeling was not requested; leave silently.  */
1352   if ((flags & UAP_PEEL) == 0)
1353     return;
1354
1355   if (dump_file)
1356     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1357
1358   /* PEEL_COUNT is the number of iterations to peel, limited by the peeled
1359      insn budget and by the cap on peel times.  */
1360   peel_count = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1361   if ((unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES) < peel_count)
1362     peel_count = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1363
1364   /* Skip big loops.  */
1365   if (peel_count == 0)
1366     {
1367       if (dump_file)
1368         fprintf (dump_file, ";; Not considering loop, is too big\n");
1369       return;
1370     }
1371
1372   /* Loops with branches inside are not simply peeled, since that
1373      increases the number of mispredicts.  The exception is when a profile
1374      was read, because then there is a good chance of peeling exactly
1375      the number of iterations the loop runs in practice.
1376      TODO: this heuristic needs tuning; while for complete unrolling
1377      the branch inside loop mostly eliminates any improvements, for
1378      peeling it is not the case.  A function call inside the loop is
1379      also a branch from the branch prediction POV (and probably a better
1380      reason to not unroll/peel).  */
1381   if (num_loop_branches (loop) > 1 && profile_status != PROFILE_READ)
1382     {
1383       if (dump_file)
1384         fprintf (dump_file, ";; Not peeling, contains branches\n");
1385       return;
1386     }
1387
1388   if (estimated_loop_iterations (loop, &niter))
1389     {
1390       /* A realistic estimate exists; give up if it reaches PEEL_COUNT.  */
1391       if (double_int::from_shwi (peel_count).ule (niter))
1392         {
1393           if (dump_file)
1394             {
1395               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1396               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1397                        (HOST_WIDEST_INT) (niter.to_shwi () + 1));
1398               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1399                        peel_count);
1400             }
1401           return;
1402         }
1403     }
1404   else if (!(max_loop_iterations (loop, &niter)
1405              && niter.ult (double_int::from_shwi (peel_count))))
1406     {
1407       /* Neither an estimate nor a small enough upper bound: for now we
1408          have no good heuristics to decide whether loop peeling will be
1409          effective, so disable it.  */
1410       if (dump_file)
1411         fprintf (dump_file,
1412                  ";; Not peeling loop, no evidence it will be profitable\n");
1413       return;
1414     }
1415
1416   /* NITER is now either the estimate or a small enough upper bound on
1417      the number of iterations; peel NITER + 1 iterations.  */
1418   peel_count = niter.to_shwi () + 1;
1419
1420   /* Success.  */
1421   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1422   loop->lpt_decision.times = peel_count;
1423 }
1424
1425 /* Peel LOOP->LPT_DECISION.TIMES iterations off LOOP by copying its body
1426    onto the preheader edge.  With LOOP->LPT_DECISION.TIMES == 3,
1427
1428    while (cond)
1429      body;
1430
1431    becomes
1432
1433    if (!cond) goto end;
1434    body;
1435    if (!cond) goto end;
1436    body;
1437    if (!cond) goto end;
1438    body;
1439    while (cond)
1440      body;
1441    end: ;
1442    */
1443 static void
1444 peel_loop_simple (struct loop *loop)
1445 {
1446   unsigned n_peeled = loop->lpt_decision.times;
1447   struct niter_desc *niter_info = get_simple_loop_desc (loop);
1448   struct opt_info *opts = NULL;
1449   sbitmap no_exit_copies;
1450   int dup_flags = DLTHE_FLAG_UPDATE_FREQ;
1451   bool ok;
1452
1453   /* Analyze the body only if IV splitting is on and several copies are made.  */
1454   if (flag_split_ivs_in_unroller && n_peeled > 1)
1455     opts = analyze_insns_in_loop (loop);
1456   if (opts)
1457     dup_flags |= DLTHE_RECORD_COPY_NUMBER;
1458
1459   /* No copy is marked in the bitmap handed to the duplication.  */
1460   no_exit_copies = sbitmap_alloc (n_peeled + 1);
1461   bitmap_clear (no_exit_copies);
1462
1463   opt_info_start_duplication (opts);
1464   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1465                                       n_peeled, no_exit_copies,
1466                                       NULL, NULL, dup_flags);
1467   gcc_assert (ok);
1468   free (no_exit_copies);
1469
1470   if (opts)
1471     {
1472       apply_opt_in_copies (opts, n_peeled, false, false);
1473       free_opt_info (opts);
1474     }
1475
1476   if (niter_info->simple_p && niter_info->const_iter)
1477     {
1478       /* The peeled iterations no longer run inside the loop.  */
1479       niter_info->niter -= n_peeled;
1480       niter_info->niter_expr = GEN_INT (niter_info->niter);
1481       niter_info->noloop_assumptions = NULL_RTX;
1482     }
1483   else if (niter_info->simple_p)
1484     {
1485       /* We cannot just update niter_expr, as its value might be clobbered
1486          inside loop.  We could handle this by counting the number into
1487          temporary just like we do in runtime unrolling, but it does not
1488          seem worthwhile.  So drop the loop description instead.  */
1489       free_simple_loop_desc (loop);
1490     }
1491
1492   if (dump_file)
1493     fprintf (dump_file, ";; Peeling loop %d times\n", n_peeled);
1494 }
1495
1496 /* Decide whether to unroll LOOP "stupidly", and by what factor; this only
1497    applies to loops that are not simple, or are simple only under
1498    assumptions.  Does nothing unless FLAGS has UAP_UNROLL_ALL.  On success
1499    records LPT_UNROLL_STUPID and the copy count in LOOP->lpt_decision.  */
1500 static void
1501 decide_unroll_stupid (struct loop *loop, int flags)
1502 {
1503   unsigned copies, copies_by_avg, pow2 = 1;
1504   struct niter_desc *niter_info;
1505   double_int niter_bound;
1506
1507   /* Unrolling of all loops was not requested; leave silently.  */
1508   if ((flags & UAP_UNROLL_ALL) == 0)
1509     return;
1510
1511   if (dump_file)
1512     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1513
1514   /* COPIES is the total number of loop bodies in the unrolled loop, the
1515      original one included (so 2 means the body is duplicated once).  */
1516   copies = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1517   copies_by_avg
1518     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1519   if (copies_by_avg < copies)
1520     copies = copies_by_avg;
1521   if ((unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES) < copies)
1522     copies = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1523
1524   /* Give the target hook, when there is one, a chance to adjust.  */
1525   if (targetm.loop_unroll_adjust)
1526     copies = targetm.loop_unroll_adjust (copies, loop);
1527
1528   /* Fewer than two copies means the loop is too big to unroll.  */
1529   if (copies <= 1)
1530     {
1531       if (dump_file)
1532         fprintf (dump_file, ";; Not considering loop, is too big\n");
1533       return;
1534     }
1535
1536   /* Simple loops without outstanding assumptions are not unrolled here.  */
1537   niter_info = get_simple_loop_desc (loop);
1538   if (niter_info->simple_p && !niter_info->assumptions)
1539     {
1540       if (dump_file)
1541         fprintf (dump_file, ";; The loop is simple\n");
1542       return;
1543     }
1544
1545   /* Do not unroll loops with branches inside -- it increases number
1546      of mispredicts.
1547      TODO: this heuristic needs tuning; a call inside the loop body
1548      is also a relatively good reason to not unroll.  */
1549   if (num_loop_branches (loop) > 1)
1550     {
1551       if (dump_file)
1552         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1553       return;
1554     }
1555
1556   /* Check whether the loop rolls enough (estimate first, then maximum).  */
1557   if ((estimated_loop_iterations (loop, &niter_bound)
1558        || max_loop_iterations (loop, &niter_bound))
1559       && niter_bound.ult (double_int::from_shwi (2 * copies)))
1560     {
1561       if (dump_file)
1562         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1563       return;
1564     }
1565
1566   /* Success.  Round COPIES down to a power of 2, as it seems that this
1567      improves results (partially because of better alignments, partially
1568      because of some dark magic).  */
1569   while (2 * pow2 <= copies)
1570     pow2 *= 2;
1571
1572   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1573   loop->lpt_decision.times = pow2 - 1;
1574 }
1575
1576 /* Unroll LOOP by appending LOOP->LPT_DECISION.TIMES copies of its body on
1577    the latch edge, every copy keeping its exit test.  With
1578    LOOP->LPT_DECISION.TIMES == 3,
1579
1580    while (cond)
1581      body;
1582
1583    becomes
1584
1585    while (cond)
1586      {
1587        body;
1588        if (!cond) break;
1589        body;
1590        if (!cond) break;
1591        body;
1592        if (!cond) break;
1593        body;
1594      }
1595    */
1596 static void
1597 unroll_loop_stupid (struct loop *loop)
1598 {
1599   unsigned n_copies = loop->lpt_decision.times;
1600   struct niter_desc *niter_info = get_simple_loop_desc (loop);
1601   struct opt_info *opts = NULL;
1602   sbitmap no_exit_copies;
1603   int dup_flags = DLTHE_FLAG_UPDATE_FREQ;
1604   bool ok;
1605
1606   /* Analyze the body only when one of the unroller optimizations is on.  */
1607   if (flag_split_ivs_in_unroller || flag_variable_expansion_in_unroller)
1608     opts = analyze_insns_in_loop (loop);
1609   if (opts)
1610     dup_flags |= DLTHE_RECORD_COPY_NUMBER;
1611
1612   /* No copy is marked in the bitmap handed to the duplication.  */
1613   no_exit_copies = sbitmap_alloc (n_copies + 1);
1614   bitmap_clear (no_exit_copies);
1615
1616   opt_info_start_duplication (opts);
1617   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1618                                       n_copies, no_exit_copies,
1619                                       NULL, NULL, dup_flags);
1620   gcc_assert (ok);
1621   free (no_exit_copies);
1622
1623   if (opts)
1624     {
1625       apply_opt_in_copies (opts, n_copies, true, true);
1626       free_opt_info (opts);
1627     }
1628
1629   if (niter_info->simple_p)
1630     {
1631       /* We indeed may get here provided that there are nontrivial assumptions
1632          for a loop to be really simple.  We could update the counts, but the
1633          problem is that we are unable to decide which exit will be taken
1634          (not really true in case the number of iterations is constant,
1635          but no one will do anything with this information, so we do not
1636          worry about it).  */
1637       niter_info->simple_p = false;
1638     }
1639
1640   if (dump_file)
1641     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1642              n_copies, num_loop_insns (loop));
1643 }
1644
1645 /* Return true if exactly one nondebug insn in LOOP references REG.  Each
1646    debug insn referencing REG that is seen increments *DEBUG_USES, which
1647    the caller is expected to have initialized.  */
1648
1649 bool
1650 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1651                                   int *debug_uses)
1652 {
1653   basic_block *blocks = get_loop_body (loop), bb;
1654   int n_refs = 0;
1655   unsigned ix;
1656   rtx insn;
1657
1658   for (ix = 0; ix < loop->num_nodes; ix++)
1659     {
1660       bb = blocks[ix];
1661       FOR_BB_INSNS (bb, insn)
1662         {
1663           if (!rtx_referenced_p (reg, insn))
1664             continue;
1665           if (DEBUG_INSN_P (insn))
1666             ++*debug_uses;
1667           else if (++n_refs > 1)
1668             break;  /* Only leaves the scan of BB.  */
1669         }
1670     }
1671   free (blocks);
1672   return n_refs == 1;
1673 }
1674
1675 /* Reset the location of debug insns in LOOP that reference REG to
1676    "unknown", stopping once DEBUG_USES of them have been reset.  */
1677
1678 static void
1679 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1680 {
1681   basic_block *blocks = get_loop_body (loop), bb;
1682   unsigned ix;
1683   rtx insn;
1684
1685   for (ix = 0; debug_uses != 0 && ix < loop->num_nodes; ix++)
1686     {
1687       bb = blocks[ix];
1688       FOR_BB_INSNS (bb, insn)
1689         if (DEBUG_INSN_P (insn) && rtx_referenced_p (reg, insn))
1690           {
1691             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1692                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1693
1694             /* All expected debug uses handled; leave this block.  */
1695             debug_uses--;
1696             if (debug_uses == 0)
1697               break;
1698           }
1699     }
1700   free (blocks);
1701 }
1702
1703 /* Determine whether INSN contains an accumulator
1704    which can be expanded into separate copies,
1705    one for each copy of the LOOP body.
1706
1707    for (i = 0 ; i < n; i++)
1708      sum += a[i];
1709
1710    ==>
1711
1712    sum += a[i]
1713    ....
1714    i = i+1;
1715    sum1 += a[i]
1716    ....
1717    i = i+1
1718    sum2 += a[i];
1719    ....
1720
1721    Return NULL if INSN contains no opportunity for expansion of accumulator.
1722    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1723    information and return a pointer to it.
1724 */
1725
1726 static struct var_to_expand *
1727 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1728 {
1729   rtx set, dest, src;
1730   struct var_to_expand *ves;
1731   unsigned accum_pos;
1732   enum rtx_code code;
1733   int debug_uses = 0;
1734
1735   set = single_set (insn);
1736   if (!set)
1737     return NULL;
1738
1739   dest = SET_DEST (set);
1740   src = SET_SRC (set);
1741   code = GET_CODE (src);
1742
1743   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1744     return NULL;
1745
1746   if (FLOAT_MODE_P (GET_MODE (dest)))
1747     {
1748       if (!flag_associative_math)
1749         return NULL;
1750       /* In the case of FMA, we're also changing the rounding.  */
1751       if (code == FMA && !flag_unsafe_math_optimizations)
1752         return NULL;
1753     }
1754
1755   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1756      in MD.  But if there is no optab to generate the insn, we can not
1757      perform the variable expansion.  This can happen if an MD provides
1758      an insn but not a named pattern to generate it, for example to avoid
1759      producing code that needs additional mode switches like for x87/mmx.
1760
1761      So we check have_insn_for which looks for an optab for the operation
1762      in SRC.  If it doesn't exist, we can't perform the expansion even
1763      though INSN is valid.  */
1764   if (!have_insn_for (code, GET_MODE (src)))
1765     return NULL;
1766
1767   if (!REG_P (dest)
1768       && !(GET_CODE (dest) == SUBREG
1769            && REG_P (SUBREG_REG (dest))))
1770     return NULL;
1771
1772   /* Find the accumulator use within the operation.  */
1773   if (code == FMA)
1774     {
1775       /* We only support accumulation via FMA in the ADD position.  */
1776       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1777         return NULL;
1778       accum_pos = 2;
1779     }
1780   else if (rtx_equal_p (dest, XEXP (src, 0)))
1781     accum_pos = 0;
1782   else if (rtx_equal_p (dest, XEXP (src, 1)))
1783     {
1784       /* The method of expansion that we are using; which includes the
1785          initialization of the expansions with zero and the summation of
1786          the expansions at the end of the computation will yield wrong
1787          results for (x = something - x) thus avoid using it in that case.  */
1788       if (code == MINUS)
1789         return NULL;
1790       accum_pos = 1;
1791     }
1792   else
1793     return NULL;
1794
1795   /* It must not otherwise be used.  */
1796   if (code == FMA)
1797     {
1798       if (rtx_referenced_p (dest, XEXP (src, 0))
1799           || rtx_referenced_p (dest, XEXP (src, 1)))
1800         return NULL;
1801     }
1802   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1803     return NULL;
1804
1805   /* It must be used in exactly one insn.  */
1806   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1807     return NULL;
1808
1809   if (dump_file)
1810     {
1811       fprintf (dump_file, "\n;; Expanding Accumulator ");
1812       print_rtl (dump_file, dest);
1813       fprintf (dump_file, "\n");
1814     }
1815
1816   if (debug_uses)
1817     /* Instead of resetting the debug insns, we could replace each
1818        debug use in the loop with the sum or product of all expanded
1819        accummulators.  Since we'll only know of all expansions at the
1820        end, we'd have to keep track of which vars_to_expand a debug
1821        insn in the loop references, take note of each copy of the
1822        debug insn during unrolling, and when it's all done, compute
1823        the sum or product of each variable and adjust the original
1824        debug insn and each copy thereof.  What a pain!  */
1825     reset_debug_uses_in_loop (loop, dest, debug_uses);
1826
1827   /* Record the accumulator to expand.  */
1828   ves = XNEW (struct var_to_expand);
1829   ves->insn = insn;
1830   ves->reg = copy_rtx (dest);
1831   ves->var_expansions.create (1);
1832   ves->next = NULL;
1833   ves->op = GET_CODE (src);
1834   ves->expansion_count = 0;
1835   ves->reuse_expansion = 0;
1836   return ves;
1837 }
1838
1839 /* Determine whether there is an induction variable in INSN that
1840    we would like to split during unrolling.
1841
1842    I.e. replace
1843
1844    i = i + 1;
1845    ...
1846    i = i + 1;
1847    ...
1848    i = i + 1;
1849    ...
1850
1851    type chains by
1852
1853    i0 = i + 1
1854    ...
1855    i = i0 + 1
1856    ...
1857    i = i0 + 2
1858    ...
1859
1860    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1861    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1862    pointer to it.  */
1863
1864 static struct iv_to_split *
1865 analyze_iv_to_split_insn (rtx insn)
1866 {
1867   rtx set, dest;
1868   struct rtx_iv iv;
1869   struct iv_to_split *ivts;
1870   bool ok;
1871
1872   /* For now we just split the basic induction variables.  Later this may be
1873      extended for example by selecting also addresses of memory references.  */
1874   set = single_set (insn);
1875   if (!set)
1876     return NULL;
1877
1878   dest = SET_DEST (set);
1879   if (!REG_P (dest))
1880     return NULL;
1881
1882   if (!biv_p (insn, dest))
1883     return NULL;
1884
1885   ok = iv_analyze_result (insn, dest, &iv);
1886
1887   /* This used to be an assert under the assumption that if biv_p returns
1888      true that iv_analyze_result must also return true.  However, that
1889      assumption is not strictly correct as evidenced by pr25569.
1890
1891      Returning NULL when iv_analyze_result returns false is safe and
1892      avoids the problems in pr25569 until the iv_analyze_* routines
1893      can be fixed, which is apparently hard and time consuming
1894      according to their author.  */
1895   if (! ok)
1896     return NULL;
1897
1898   if (iv.step == const0_rtx
1899       || iv.mode != iv.extend_mode)
1900     return NULL;
1901
1902   /* Record the insn to split.  */
1903   ivts = XNEW (struct iv_to_split);
1904   ivts->insn = insn;
1905   ivts->orig_var = dest;
1906   ivts->base_var = NULL_RTX;
1907   ivts->step = iv.step;
1908   ivts->next = NULL;
1909   ivts->n_loc = 1;
1910   ivts->loc[0] = 1;
1911
1912   return ivts;
1913 }
1914
1915 /* Determines which of insns in LOOP can be optimized.
1916    Return a OPT_INFO struct with the relevant hash tables filled
1917    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1918    is undefined for the return value.  */
1919
1920 static struct opt_info *
1921 analyze_insns_in_loop (struct loop *loop)
1922 {
1923   basic_block *body, bb;
1924   unsigned i;
1925   struct opt_info *opt_info = XCNEW (struct opt_info);
1926   rtx insn;
1927   struct iv_to_split *ivts = NULL;
1928   struct var_to_expand *ves = NULL;
1929   iv_to_split **slot1;
1930   var_to_expand **slot2;
1931   vec<edge> edges = get_loop_exit_edges (loop);
1932   edge exit;
1933   bool can_apply = false;
1934
1935   iv_analysis_loop_init (loop);
1936
1937   body = get_loop_body (loop);
1938
1939   if (flag_split_ivs_in_unroller)
1940     {
1941       opt_info->insns_to_split.create (5 * loop->num_nodes);
1942       opt_info->iv_to_split_head = NULL;
1943       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1944     }
1945
1946   /* Record the loop exit bb and loop preheader before the unrolling.  */
1947   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1948
1949   if (edges.length () == 1)
1950     {
1951       exit = edges[0];
1952       if (!(exit->flags & EDGE_COMPLEX))
1953         {
1954           opt_info->loop_exit = split_edge (exit);
1955           can_apply = true;
1956         }
1957     }
1958
1959   if (flag_variable_expansion_in_unroller
1960       && can_apply)
1961     {
1962       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1963       opt_info->var_to_expand_head = NULL;
1964       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1965     }
1966
1967   for (i = 0; i < loop->num_nodes; i++)
1968     {
1969       bb = body[i];
1970       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1971         continue;
1972
1973       FOR_BB_INSNS (bb, insn)
1974       {
1975         if (!INSN_P (insn))
1976           continue;
1977
1978         if (opt_info->insns_to_split.is_created ())
1979           ivts = analyze_iv_to_split_insn (insn);
1980
1981         if (ivts)
1982           {
1983             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
1984             gcc_assert (*slot1 == NULL);
1985             *slot1 = ivts;
1986             *opt_info->iv_to_split_tail = ivts;
1987             opt_info->iv_to_split_tail = &ivts->next;
1988             continue;
1989           }
1990
1991         if (opt_info->insns_with_var_to_expand.is_created ())
1992           ves = analyze_insn_to_expand_var (loop, insn);
1993
1994         if (ves)
1995           {
1996             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
1997             gcc_assert (*slot2 == NULL);
1998             *slot2 = ves;
1999             *opt_info->var_to_expand_tail = ves;
2000             opt_info->var_to_expand_tail = &ves->next;
2001           }
2002       }
2003     }
2004
2005   edges.release ();
2006   free (body);
2007   return opt_info;
2008 }
2009
2010 /* Called just before loop duplication.  Records start of duplicated area
2011    to OPT_INFO.  */
2012
2013 static void
2014 opt_info_start_duplication (struct opt_info *opt_info)
2015 {
2016   if (opt_info)
2017     opt_info->first_new_block = last_basic_block;
2018 }
2019
/* Return the number of iterations between the initialization of the
   base variable and the copy numbered N_COPY.  N_COPIES is the total
   number of newly created copies.  UNROLLING is true if the loop is
   being unrolled (not peeled).  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization is done in the original loop
     body (number 0), so the copy number is the distance.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization occurs in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
2044
2045 /* Locate in EXPR the expression corresponding to the location recorded
2046    in IVTS, and return a pointer to the RTX for this location.  */
2047
2048 static rtx *
2049 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2050 {
2051   unsigned i;
2052   rtx *ret = &expr;
2053
2054   for (i = 0; i < ivts->n_loc; i++)
2055     ret = &XEXP (*ret, ivts->loc[i]);
2056
2057   return ret;
2058 }
2059
2060 /* Allocate basic variable for the induction variable chain.  */
2061
2062 static void
2063 allocate_basic_variable (struct iv_to_split *ivts)
2064 {
2065   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2066
2067   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2068 }
2069
2070 /* Insert initialization of basic variable of IVTS before INSN, taking
2071    the initial value from INSN.  */
2072
2073 static void
2074 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2075 {
2076   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2077   rtx seq;
2078
2079   start_sequence ();
2080   expr = force_operand (expr, ivts->base_var);
2081   if (expr != ivts->base_var)
2082     emit_move_insn (ivts->base_var, expr);
2083   seq = get_insns ();
2084   end_sequence ();
2085
2086   emit_insn_before (seq, insn);
2087 }
2088
2089 /* Replace the use of induction variable described in IVTS in INSN
2090    by base variable + DELTA * step.  */
2091
2092 static void
2093 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2094 {
2095   rtx expr, *loc, seq, incr, var;
2096   enum machine_mode mode = GET_MODE (ivts->base_var);
2097   rtx src, dest, set;
2098
2099   /* Construct base + DELTA * step.  */
2100   if (!delta)
2101     expr = ivts->base_var;
2102   else
2103     {
2104       incr = simplify_gen_binary (MULT, mode,
2105                                   ivts->step, gen_int_mode (delta, mode));
2106       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2107                                   ivts->base_var, incr);
2108     }
2109
2110   /* Figure out where to do the replacement.  */
2111   loc = get_ivts_expr (single_set (insn), ivts);
2112
2113   /* If we can make the replacement right away, we're done.  */
2114   if (validate_change (insn, loc, expr, 0))
2115     return;
2116
2117   /* Otherwise, force EXPR into a register and try again.  */
2118   start_sequence ();
2119   var = gen_reg_rtx (mode);
2120   expr = force_operand (expr, var);
2121   if (expr != var)
2122     emit_move_insn (var, expr);
2123   seq = get_insns ();
2124   end_sequence ();
2125   emit_insn_before (seq, insn);
2126
2127   if (validate_change (insn, loc, var, 0))
2128     return;
2129
2130   /* The last chance.  Try recreating the assignment in insn
2131      completely from scratch.  */
2132   set = single_set (insn);
2133   gcc_assert (set);
2134
2135   start_sequence ();
2136   *loc = var;
2137   src = copy_rtx (SET_SRC (set));
2138   dest = copy_rtx (SET_DEST (set));
2139   src = force_operand (src, dest);
2140   if (src != dest)
2141     emit_move_insn (dest, src);
2142   seq = get_insns ();
2143   end_sequence ();
2144
2145   emit_insn_before (seq, insn);
2146   delete_insn (insn);
2147 }
2148
2149
2150 /* Return one expansion of the accumulator recorded in struct VE.  */
2151
2152 static rtx
2153 get_expansion (struct var_to_expand *ve)
2154 {
2155   rtx reg;
2156
2157   if (ve->reuse_expansion == 0)
2158     reg = ve->reg;
2159   else
2160     reg = ve->var_expansions[ve->reuse_expansion - 1];
2161
2162   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2163     ve->reuse_expansion = 0;
2164   else
2165     ve->reuse_expansion++;
2166
2167   return reg;
2168 }
2169
2170
2171 /* Given INSN replace the uses of the accumulator recorded in VE
2172    with a new register.  */
2173
2174 static void
2175 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2176 {
2177   rtx new_reg, set;
2178   bool really_new_expansion = false;
2179
2180   set = single_set (insn);
2181   gcc_assert (set);
2182
2183   /* Generate a new register only if the expansion limit has not been
2184      reached.  Else reuse an already existing expansion.  */
2185   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2186     {
2187       really_new_expansion = true;
2188       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2189     }
2190   else
2191     new_reg = get_expansion (ve);
2192
2193   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2194   if (apply_change_group ())
2195     if (really_new_expansion)
2196       {
2197         ve->var_expansions.safe_push (new_reg);
2198         ve->expansion_count++;
2199       }
2200 }
2201
2202 /* Initialize the variable expansions in loop preheader.  PLACE is the
2203    loop-preheader basic block where the initialization of the
2204    expansions should take place.  The expansions are initialized with
2205    (-0) when the operation is plus or minus to honor sign zero.  This
2206    way we can prevent cases where the sign of the final result is
2207    effected by the sign of the expansion.  Here is an example to
2208    demonstrate this:
2209
2210    for (i = 0 ; i < n; i++)
2211      sum += something;
2212
2213    ==>
2214
2215    sum += something
2216    ....
2217    i = i+1;
2218    sum1 += something
2219    ....
2220    i = i+1
2221    sum2 += something;
2222    ....
2223
2224    When SUM is initialized with -zero and SOMETHING is also -zero; the
2225    final result of sum should be -zero thus the expansions sum1 and sum2
2226    should be initialized with -zero as well (otherwise we will get +zero
2227    as the final result).  */
2228
2229 static void
2230 insert_var_expansion_initialization (struct var_to_expand *ve,
2231                                      basic_block place)
2232 {
2233   rtx seq, var, zero_init;
2234   unsigned i;
2235   enum machine_mode mode = GET_MODE (ve->reg);
2236   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2237
2238   if (ve->var_expansions.length () == 0)
2239     return;
2240
2241   start_sequence ();
2242   switch (ve->op)
2243     {
2244     case FMA:
2245       /* Note that we only accumulate FMA via the ADD operand.  */
2246     case PLUS:
2247     case MINUS:
2248       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2249         {
2250           if (honor_signed_zero_p)
2251             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2252           else
2253             zero_init = CONST0_RTX (mode);
2254           emit_move_insn (var, zero_init);
2255         }
2256       break;
2257
2258     case MULT:
2259       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2260         {
2261           zero_init = CONST1_RTX (GET_MODE (var));
2262           emit_move_insn (var, zero_init);
2263         }
2264       break;
2265
2266     default:
2267       gcc_unreachable ();
2268     }
2269
2270   seq = get_insns ();
2271   end_sequence ();
2272
2273   emit_insn_after (seq, BB_END (place));
2274 }
2275
2276 /* Combine the variable expansions at the loop exit.  PLACE is the
2277    loop exit basic block where the summation of the expansions should
2278    take place.  */
2279
2280 static void
2281 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2282 {
2283   rtx sum = ve->reg;
2284   rtx expr, seq, var, insn;
2285   unsigned i;
2286
2287   if (ve->var_expansions.length () == 0)
2288     return;
2289
2290   start_sequence ();
2291   switch (ve->op)
2292     {
2293     case FMA:
2294       /* Note that we only accumulate FMA via the ADD operand.  */
2295     case PLUS:
2296     case MINUS:
2297       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2298         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2299       break;
2300
2301     case MULT:
2302       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2303         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2304       break;
2305
2306     default:
2307       gcc_unreachable ();
2308     }
2309
2310   expr = force_operand (sum, ve->reg);
2311   if (expr != ve->reg)
2312     emit_move_insn (ve->reg, expr);
2313   seq = get_insns ();
2314   end_sequence ();
2315
2316   insn = BB_HEAD (place);
2317   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2318     insn = NEXT_INSN (insn);
2319
2320   emit_insn_after (seq, insn);
2321 }
2322
2323 /* Strip away REG_EQUAL notes for IVs we're splitting.
2324
2325    Updating REG_EQUAL notes for IVs we split is tricky: We
2326    cannot tell until after unrolling, DF-rescanning, and liveness
2327    updating, whether an EQ_USE is reached by the split IV while
2328    the IV reg is still live.  See PR55006.
2329
2330    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2331    because RTL loop-iv requires us to defer rescanning insns and
2332    any notes attached to them.  So resort to old techniques...  */
2333
2334 static void
2335 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2336 {
2337   struct iv_to_split *ivts;
2338   rtx note = find_reg_equal_equiv_note (insn);
2339   if (! note)
2340     return;
2341   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2342     if (reg_mentioned_p (ivts->orig_var, note))
2343       {
2344         remove_note (insn, note);
2345         return;
2346       }
2347 }
2348
2349 /* Apply loop optimizations in loop copies using the
2350    data which gathered during the unrolling.  Structure
2351    OPT_INFO record that data.
2352
2353    UNROLLING is true if we unrolled (not peeled) the loop.
2354    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2355    the loop (as it should happen in complete unrolling, but not in ordinary
2356    peeling of the loop).  */
2357
2358 static void
2359 apply_opt_in_copies (struct opt_info *opt_info,
2360                      unsigned n_copies, bool unrolling,
2361                      bool rewrite_original_loop)
2362 {
2363   unsigned i, delta;
2364   basic_block bb, orig_bb;
2365   rtx insn, orig_insn, next;
2366   struct iv_to_split ivts_templ, *ivts;
2367   struct var_to_expand ve_templ, *ves;
2368
2369   /* Sanity check -- we need to put initialization in the original loop
2370      body.  */
2371   gcc_assert (!unrolling || rewrite_original_loop);
2372
2373   /* Allocate the basic variables (i0).  */
2374   if (opt_info->insns_to_split.is_created ())
2375     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2376       allocate_basic_variable (ivts);
2377
2378   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2379     {
2380       bb = BASIC_BLOCK (i);
2381       orig_bb = get_bb_original (bb);
2382
2383       /* bb->aux holds position in copy sequence initialized by
2384          duplicate_loop_to_header_edge.  */
2385       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2386                                         unrolling);
2387       bb->aux = 0;
2388       orig_insn = BB_HEAD (orig_bb);
2389       FOR_BB_INSNS_SAFE (bb, insn, next)
2390         {
2391           if (!INSN_P (insn)
2392               || (DEBUG_INSN_P (insn)
2393                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2394             continue;
2395
2396           while (!INSN_P (orig_insn)
2397                  || (DEBUG_INSN_P (orig_insn)
2398                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2399                          == LABEL_DECL)))
2400             orig_insn = NEXT_INSN (orig_insn);
2401
2402           ivts_templ.insn = orig_insn;
2403           ve_templ.insn = orig_insn;
2404
2405           /* Apply splitting iv optimization.  */
2406           if (opt_info->insns_to_split.is_created ())
2407             {
2408               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2409
2410               ivts = opt_info->insns_to_split.find (&ivts_templ);
2411
2412               if (ivts)
2413                 {
2414                   gcc_assert (GET_CODE (PATTERN (insn))
2415                               == GET_CODE (PATTERN (orig_insn)));
2416
2417                   if (!delta)
2418                     insert_base_initialization (ivts, insn);
2419                   split_iv (ivts, insn, delta);
2420                 }
2421             }
2422           /* Apply variable expansion optimization.  */
2423           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2424             {
2425               ves = (struct var_to_expand *)
2426                 opt_info->insns_with_var_to_expand.find (&ve_templ);
2427               if (ves)
2428                 {
2429                   gcc_assert (GET_CODE (PATTERN (insn))
2430                               == GET_CODE (PATTERN (orig_insn)));
2431                   expand_var_during_unrolling (ves, insn);
2432                 }
2433             }
2434           orig_insn = NEXT_INSN (orig_insn);
2435         }
2436     }
2437
2438   if (!rewrite_original_loop)
2439     return;
2440
2441   /* Initialize the variable expansions in the loop preheader
2442      and take care of combining them at the loop exit.  */
2443   if (opt_info->insns_with_var_to_expand.is_created ())
2444     {
2445       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2446         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2447       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2448         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2449     }
2450
2451   /* Rewrite also the original loop body.  Find them as originals of the blocks
2452      in the last copied iteration, i.e. those that have
2453      get_bb_copy (get_bb_original (bb)) == bb.  */
2454   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2455     {
2456       bb = BASIC_BLOCK (i);
2457       orig_bb = get_bb_original (bb);
2458       if (get_bb_copy (orig_bb) != bb)
2459         continue;
2460
2461       delta = determine_split_iv_delta (0, n_copies, unrolling);
2462       for (orig_insn = BB_HEAD (orig_bb);
2463            orig_insn != NEXT_INSN (BB_END (bb));
2464            orig_insn = next)
2465         {
2466           next = NEXT_INSN (orig_insn);
2467
2468           if (!INSN_P (orig_insn))
2469             continue;
2470
2471           ivts_templ.insn = orig_insn;
2472           if (opt_info->insns_to_split.is_created ())
2473             {
2474               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2475
2476               ivts = (struct iv_to_split *)
2477                 opt_info->insns_to_split.find (&ivts_templ);
2478               if (ivts)
2479                 {
2480                   if (!delta)
2481                     insert_base_initialization (ivts, orig_insn);
2482                   split_iv (ivts, orig_insn, delta);
2483                   continue;
2484                 }
2485             }
2486
2487         }
2488     }
2489 }
2490
2491 /* Release OPT_INFO.  */
2492
2493 static void
2494 free_opt_info (struct opt_info *opt_info)
2495 {
2496   if (opt_info->insns_to_split.is_created ())
2497     opt_info->insns_to_split.dispose ();
2498   if (opt_info->insns_with_var_to_expand.is_created ())
2499     {
2500       struct var_to_expand *ves;
2501
2502       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2503         ves->var_expansions.release ();
2504       opt_info->insns_with_var_to_expand.dispose ();
2505     }
2506   free (opt_info);
2507 }