gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "hard-reg-set.h"
  26 #include "obstack.h"
  27 #include "basic-block.h"
  28 #include "cfgloop.h"
  29 #include "params.h"
  30 #include "expr.h"
  31 #include "hash-table.h"
  32 #include "recog.h"
  33 #include "target.h"
  34 #include "dumpfile.h"
  35
  36 /* This pass performs loop unrolling and peeling.  We only perform these
  37    optimizations on innermost loops (with single exception) because
  38    the impact on performance is greatest here, and we want to avoid
   39    unnecessary code size growth.  The gain comes from more sequential
   40    code, better optimization opportunities for later passes and, in some
   41    cases, fewer tests of exit conditions.  The main problem is code growth,
   42    which impacts performance negatively due to cache effects.
  43
  44    What we do:
  45
  46    -- complete peeling of once-rolling loops; this is the above mentioned
  47       exception, as this causes loop to be cancelled completely and
  48       does not cause code growth
  49    -- complete peeling of loops that roll (small) constant times.
  50    -- simple peeling of first iterations of loops that do not roll much
  51       (according to profile feedback)
  52    -- unrolling of loops that roll constant times; this is almost always
  53       win, as we get rid of exit condition tests.
  54    -- unrolling of loops that roll number of times that we can compute
  55       in runtime; we also get rid of exit condition tests here, but there
  56       is the extra expense for calculating the number of iterations
  57    -- simple unrolling of remaining loops; this is performed only if we
  58       are asked to, as the gain is questionable in this case and often
  59       it may even slow down the code
  60    For more detailed descriptions of each of those, see comments at
  61    appropriate function below.
  62
   63    There are many parameters (defined and described in params.def) that
   64    control how much we unroll/peel.
  65
   66    ??? A major problem is that we have no good way to determine
   67    how many times we should unroll the loop; the experiments I have made
   68    show that this choice may affect performance by several percent.
  69    */
  70
/* Information about induction variables to split.  Each record describes
   one insn; records are chained through NEXT.  */

struct iv_to_split
{
  rtx insn;             /* The insn in which the induction variable occurs.  */
  rtx orig_var;         /* The variable (register) for the IV before split.  */
  rtx base_var;         /* The variable on which the values in the further
                           iterations are based.  */
  rtx step;             /* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
  unsigned n_loc;       /* Number of entries of LOC that are in use.  */
  unsigned loc[3];      /* Location where the definition of the induction
                           variable occurs in the insn.  For example if
                           N_LOC is 2, the expression is located at
                           XEXP (XEXP (single_set, loc[0]), loc[1]).  */
};
  87
/* Information about accumulators to expand.  Each record describes one
   insn; records are chained through NEXT.  */

struct var_to_expand
{
  rtx insn;                        /* The insn in which the variable expansion
                                      occurs.  */
  rtx reg;                         /* The accumulator which is expanded.  */
  vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;      /* Next entry in walking order.  */
  enum rtx_code op;                /* The type of the accumulation - addition, subtraction
                                      or multiplication.  */
  int expansion_count;             /* Count the number of expansions generated so far.  */
  int reuse_expansion;             /* The expansion we intend to reuse to expand
                                      the accumulator.  If REUSE_EXPANSION is 0 reuse
                                      the original accumulator.  Else use
                                      var_expansions[REUSE_EXPANSION - 1].  */
};
 104
/* Hashtable helper for iv_to_split.  Declares the element and lookup-key
   types (both iv_to_split) together with the hash and equality callbacks
   that are defined right after this structure.  */

struct iv_split_hasher : typed_free_remove <iv_to_split>
{
  typedef iv_to_split value_type;
  typedef iv_to_split compare_type;
  static inline hashval_t hash (const value_type *);
  static inline bool equal (const value_type *, const compare_type *);
};
 114
 115
 116 /* A hash function for information about insns to split.  */
 117
 118 inline hashval_t
 119 iv_split_hasher::hash (const value_type *ivts)
 120 {
 121   return (hashval_t) INSN_UID (ivts->insn);
 122 }
 123
 124 /* An equality functions for information about insns to split.  */
 125
 126 inline bool
 127 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
 128 {
 129   return i1->insn == i2->insn;
 130 }
 131
/* Hashtable helper for var_to_expand.  Declares the element and lookup-key
   types (both var_to_expand) together with the hash and equality callbacks
   that are defined right after this structure.  */

struct var_expand_hasher : typed_free_remove <var_to_expand>
{
  typedef var_to_expand value_type;
  typedef var_to_expand compare_type;
  static inline hashval_t hash (const value_type *);
  static inline bool equal (const value_type *, const compare_type *);
};
 141
 142 /* Return a hash for VES.  */
 143
 144 inline hashval_t
 145 var_expand_hasher::hash (const value_type *ves)
 146 {
 147   return (hashval_t) INSN_UID (ves->insn);
 148 }
 149
 150 /* Return true if I1 and I2 refer to the same instruction.  */
 151
 152 inline bool
 153 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
 154 {
 155   return i1->insn == i2->insn;
 156 }
 157
/* Information about optimization applied in
   the unrolled loop.  It holds two collections, each kept both as a hash
   table and as a singly linked list (head plus pointer to the tail link):
   the induction variables to split and the accumulators to expand.  */

struct opt_info
{
  hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
                                                  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
                                        insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;        /* The first basic block that was
                                      duplicated.  */
  basic_block loop_exit;           /* The loop exit basic block.  */
  basic_block loop_preheader;      /* The loop preheader basic block.  */
};
 176
/* Forward declarations of the file-local (static) functions.  */

static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);
static void decide_peel_simple (struct loop *, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx);
static void expand_var_during_unrolling (struct var_to_expand *, rtx);
static void insert_var_expansion_initialization (struct var_to_expand *,
                                                 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
                                             basic_block);
static rtx get_expansion (struct var_to_expand *);
 203
 204 /* Emit a message summarizing the unroll or peel that will be
 205    performed for LOOP, along with the loop's location LOCUS, if
 206    appropriate given the dump or -fopt-info settings.  */
 207
 208 static void
 209 report_unroll_peel (struct loop *loop, location_t locus)
 210 {
 211   struct niter_desc *desc;
 212   int niters = 0;
 213   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 214
 215   if (loop->lpt_decision.decision == LPT_NONE)
 216     return;
 217
 218   if (!dump_enabled_p ())
 219     return;
 220
 221   /* In the special case where the loop never iterated, emit
 222      a different message so that we don't report an unroll by 0.
 223      This matches the equivalent message emitted during tree unrolling.  */
 224   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 225       && !loop->lpt_decision.times)
 226     {
 227       dump_printf_loc (report_flags, locus,
 228                        "loop turned into non-loop; it never loops.\n");
 229       return;
 230     }
 231
 232   desc = get_simple_loop_desc (loop);
 233
 234   if (desc->const_iter)
 235     niters = desc->niter;
 236   else if (loop->header->count)
 237     niters = expected_loop_iterations (loop);
 238
 239   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 240     dump_printf_loc (report_flags, locus,
 241                      "loop with %d iterations completely unrolled",
 242                      loop->lpt_decision.times + 1);
 243   else
 244     dump_printf_loc (report_flags, locus,
 245                      "loop %s %d times",
 246                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 247                        ? "peeled" : "unrolled"),
 248                      loop->lpt_decision.times);
 249   if (profile_info)
 250     dump_printf (report_flags,
 251                  " (header execution count %d",
 252                  (int)loop->header->count);
 253   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 254     dump_printf (report_flags,
 255                  "%s%s iterations %d)",
 256                  profile_info ? ", " : " (",
 257                  desc->const_iter ? "const" : "average",
 258                  niters);
 259   else if (profile_info)
 260     dump_printf (report_flags, ")");
 261
 262   dump_printf (report_flags, "\n");
 263 }
 264
 265 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 266 void
 267 unroll_and_peel_loops (int flags)
 268 {
 269   struct loop *loop;
 270   bool changed = false;
 271   loop_iterator li;
 272
 273   /* First perform complete loop peeling (it is almost surely a win,
 274      and affects parameters for further decision a lot).  */
 275   peel_loops_completely (flags);
 276
 277   /* Now decide rest of unrolling and peeling.  */
 278   decide_unrolling_and_peeling (flags);
 279
 280   /* Scan the loops, inner ones first.  */
 281   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 282     {
 283       /* And perform the appropriate transformations.  */
 284       switch (loop->lpt_decision.decision)
 285         {
 286         case LPT_PEEL_COMPLETELY:
 287           /* Already done.  */
 288           gcc_unreachable ();
 289         case LPT_PEEL_SIMPLE:
 290           peel_loop_simple (loop);
 291           changed = true;
 292           break;
 293         case LPT_UNROLL_CONSTANT:
 294           unroll_loop_constant_iterations (loop);
 295           changed = true;
 296           break;
 297         case LPT_UNROLL_RUNTIME:
 298           unroll_loop_runtime_iterations (loop);
 299           changed = true;
 300           break;
 301         case LPT_UNROLL_STUPID:
 302           unroll_loop_stupid (loop);
 303           changed = true;
 304           break;
 305         case LPT_NONE:
 306           break;
 307         default:
 308           gcc_unreachable ();
 309         }
 310     }
 311
 312     if (changed)
 313       {
 314         calculate_dominance_info (CDI_DOMINATORS);
 315         fix_loop_structure (NULL);
 316       }
 317
 318   iv_analysis_done ();
 319 }
 320
 321 /* Check whether exit of the LOOP is at the end of loop body.  */
 322
 323 static bool
 324 loop_exit_at_end_p (struct loop *loop)
 325 {
 326   struct niter_desc *desc = get_simple_loop_desc (loop);
 327   rtx insn;
 328
 329   if (desc->in_edge->dest != loop->latch)
 330     return false;
 331
 332   /* Check that the latch is empty.  */
 333   FOR_BB_INSNS (loop->latch, insn)
 334     {
 335       if (NONDEBUG_INSN_P (insn))
 336         return false;
 337     }
 338
 339   return true;
 340 }
 341
 342 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 343 static void
 344 peel_loops_completely (int flags)
 345 {
 346   struct loop *loop;
 347   loop_iterator li;
 348   bool changed = false;
 349
 350   /* Scan the loops, the inner ones first.  */
 351   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 352     {
 353       loop->lpt_decision.decision = LPT_NONE;
 354       location_t locus = get_loop_location (loop);
 355
 356       if (dump_enabled_p ())
 357         dump_printf_loc (TDF_RTL, locus,
 358                          ";; *** Considering loop %d at BB %d for "
 359                          "complete peeling ***\n",
 360                          loop->num, loop->header->index);
 361
 362       loop->ninsns = num_loop_insns (loop);
 363
 364       decide_peel_once_rolling (loop, flags);
 365       if (loop->lpt_decision.decision == LPT_NONE)
 366         decide_peel_completely (loop, flags);
 367
 368       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 369         {
 370           report_unroll_peel (loop, locus);
 371           peel_loop_completely (loop);
 372           changed = true;
 373         }
 374     }
 375
 376     if (changed)
 377       {
 378         calculate_dominance_info (CDI_DOMINATORS);
 379         fix_loop_structure (NULL);
 380       }
 381 }
 382
 383 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 384 static void
 385 decide_unrolling_and_peeling (int flags)
 386 {
 387   struct loop *loop;
 388   loop_iterator li;
 389
 390   /* Scan the loops, inner ones first.  */
 391   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 392     {
 393       loop->lpt_decision.decision = LPT_NONE;
 394       location_t locus = get_loop_location (loop);
 395
 396       if (dump_enabled_p ())
 397         dump_printf_loc (TDF_RTL, locus,
 398                          ";; *** Considering loop %d at BB %d for "
 399                          "unrolling and peeling ***\n",
 400                          loop->num, loop->header->index);
 401
 402       /* Do not peel cold areas.  */
 403       if (optimize_loop_for_size_p (loop))
 404         {
 405           if (dump_file)
 406             fprintf (dump_file, ";; Not considering loop, cold area\n");
 407           continue;
 408         }
 409
 410       /* Can the loop be manipulated?  */
 411       if (!can_duplicate_loop_p (loop))
 412         {
 413           if (dump_file)
 414             fprintf (dump_file,
 415                      ";; Not considering loop, cannot duplicate\n");
 416           continue;
 417         }
 418
 419       /* Skip non-innermost loops.  */
 420       if (loop->inner)
 421         {
 422           if (dump_file)
 423             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 424           continue;
 425         }
 426
 427       loop->ninsns = num_loop_insns (loop);
 428       loop->av_ninsns = average_num_loop_insns (loop);
 429
 430       /* Try transformations one by one in decreasing order of
 431          priority.  */
 432
 433       decide_unroll_constant_iterations (loop, flags);
 434       if (loop->lpt_decision.decision == LPT_NONE)
 435         decide_unroll_runtime_iterations (loop, flags);
 436       if (loop->lpt_decision.decision == LPT_NONE)
 437         decide_unroll_stupid (loop, flags);
 438       if (loop->lpt_decision.decision == LPT_NONE)
 439         decide_peel_simple (loop, flags);
 440
 441       report_unroll_peel (loop, locus);
 442     }
 443 }
 444
 445 /* Decide whether the LOOP is once rolling and suitable for complete
 446    peeling.  */
 447 static void
 448 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 449 {
 450   struct niter_desc *desc;
 451
 452   if (dump_file)
 453     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 454
 455   /* Is the loop small enough?  */
 456   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 457     {
 458       if (dump_file)
 459         fprintf (dump_file, ";; Not considering loop, is too big\n");
 460       return;
 461     }
 462
 463   /* Check for simple loops.  */
 464   desc = get_simple_loop_desc (loop);
 465
 466   /* Check number of iterations.  */
 467   if (!desc->simple_p
 468       || desc->assumptions
 469       || desc->infinite
 470       || !desc->const_iter
 471       || (desc->niter != 0
 472           && get_max_loop_iterations_int (loop) != 0))
 473     {
 474       if (dump_file)
 475         fprintf (dump_file,
 476                  ";; Unable to prove that the loop rolls exactly once\n");
 477       return;
 478     }
 479
 480   /* Success.  */
 481   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 482 }
 483
 484 /* Decide whether the LOOP is suitable for complete peeling.  */
 485 static void
 486 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 487 {
 488   unsigned npeel;
 489   struct niter_desc *desc;
 490
 491   if (dump_file)
 492     fprintf (dump_file, "\n;; Considering peeling completely\n");
 493
 494   /* Skip non-innermost loops.  */
 495   if (loop->inner)
 496     {
 497       if (dump_file)
 498         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 499       return;
 500     }
 501
 502   /* Do not peel cold areas.  */
 503   if (optimize_loop_for_size_p (loop))
 504     {
 505       if (dump_file)
 506         fprintf (dump_file, ";; Not considering loop, cold area\n");
 507       return;
 508     }
 509
 510   /* Can the loop be manipulated?  */
 511   if (!can_duplicate_loop_p (loop))
 512     {
 513       if (dump_file)
 514         fprintf (dump_file,
 515                  ";; Not considering loop, cannot duplicate\n");
 516       return;
 517     }
 518
 519   /* npeel = number of iterations to peel.  */
 520   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 521   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 522     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 523
 524   /* Is the loop small enough?  */
 525   if (!npeel)
 526     {
 527       if (dump_file)
 528         fprintf (dump_file, ";; Not considering loop, is too big\n");
 529       return;
 530     }
 531
 532   /* Check for simple loops.  */
 533   desc = get_simple_loop_desc (loop);
 534
 535   /* Check number of iterations.  */
 536   if (!desc->simple_p
 537       || desc->assumptions
 538       || !desc->const_iter
 539       || desc->infinite)
 540     {
 541       if (dump_file)
 542         fprintf (dump_file,
 543                  ";; Unable to prove that the loop iterates constant times\n");
 544       return;
 545     }
 546
 547   if (desc->niter > npeel - 1)
 548     {
 549       if (dump_file)
 550         {
 551           fprintf (dump_file,
 552                    ";; Not peeling loop completely, rolls too much (");
 553           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 554           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 555         }
 556       return;
 557     }
 558
 559   /* Success.  */
 560   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 561 }
 562
 563 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 564    completely.  The transformation done:
 565
 566    for (i = 0; i < 4; i++)
 567      body;
 568
 569    ==>
 570
 571    i = 0;
 572    body; i++;
 573    body; i++;
 574    body; i++;
 575    body; i++;
 576    */
 577 static void
 578 peel_loop_completely (struct loop *loop)
 579 {
 580   sbitmap wont_exit;
 581   unsigned HOST_WIDE_INT npeel;
 582   unsigned i;
 583   vec<edge> remove_edges;
 584   edge ein;
 585   struct niter_desc *desc = get_simple_loop_desc (loop);
 586   struct opt_info *opt_info = NULL;
 587
 588   npeel = desc->niter;
 589
 590   if (npeel)
 591     {
 592       bool ok;
 593
 594       wont_exit = sbitmap_alloc (npeel + 1);
 595       bitmap_ones (wont_exit);
 596       bitmap_clear_bit (wont_exit, 0);
 597       if (desc->noloop_assumptions)
 598         bitmap_clear_bit (wont_exit, 1);
 599
 600       remove_edges.create (0);
 601
 602       if (flag_split_ivs_in_unroller)
 603         opt_info = analyze_insns_in_loop (loop);
 604
 605       opt_info_start_duplication (opt_info);
 606       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 607                                           npeel,
 608                                           wont_exit, desc->out_edge,
 609                                           &remove_edges,
 610                                           DLTHE_FLAG_UPDATE_FREQ
 611                                           | DLTHE_FLAG_COMPLETTE_PEEL
 612                                           | (opt_info
 613                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 614       gcc_assert (ok);
 615
 616       free (wont_exit);
 617
 618       if (opt_info)
 619         {
 620           apply_opt_in_copies (opt_info, npeel, false, true);
 621           free_opt_info (opt_info);
 622         }
 623
 624       /* Remove the exit edges.  */
 625       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 626         remove_path (ein);
 627       remove_edges.release ();
 628     }
 629
 630   ein = desc->in_edge;
 631   free_simple_loop_desc (loop);
 632
 633   /* Now remove the unreachable part of the last iteration and cancel
 634      the loop.  */
 635   remove_path (ein);
 636
 637   if (dump_file)
 638     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 639 }
 640
 641 /* Decide whether to unroll LOOP iterating constant number of times
 642    and how much.  */
 643
 644 static void
 645 decide_unroll_constant_iterations (struct loop *loop, int flags)
 646 {
 647   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 648   struct niter_desc *desc;
 649   widest_int iterations;
 650
 651   if (!(flags & UAP_UNROLL))
 652     {
 653       /* We were not asked to, just return back silently.  */
 654       return;
 655     }
 656
 657   if (dump_file)
 658     fprintf (dump_file,
 659              "\n;; Considering unrolling loop with constant "
 660              "number of iterations\n");
 661
 662   /* nunroll = total number of copies of the original loop body in
 663      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 664   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 665   nunroll_by_av
 666     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 667   if (nunroll > nunroll_by_av)
 668     nunroll = nunroll_by_av;
 669   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 670     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 671
 672   /* Skip big loops.  */
 673   if (nunroll <= 1)
 674     {
 675       if (dump_file)
 676         fprintf (dump_file, ";; Not considering loop, is too big\n");
 677       return;
 678     }
 679
 680   /* Check for simple loops.  */
 681   desc = get_simple_loop_desc (loop);
 682
 683   /* Check number of iterations.  */
 684   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 685     {
 686       if (dump_file)
 687         fprintf (dump_file,
 688                  ";; Unable to prove that the loop iterates constant times\n");
 689       return;
 690     }
 691
 692   /* Check whether the loop rolls enough to consider.
 693      Consult also loop bounds and profile; in the case the loop has more
 694      than one exit it may well loop less than determined maximal number
 695      of iterations.  */
 696   if (desc->niter < 2 * nunroll
 697       || ((get_estimated_loop_iterations (loop, &iterations)
 698            || get_max_loop_iterations (loop, &iterations))
 699           && wi::ltu_p (iterations, 2 * nunroll)))
 700     {
 701       if (dump_file)
 702         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 703       return;
 704     }
 705
 706   /* Success; now compute number of iterations to unroll.  We alter
 707      nunroll so that as few as possible copies of loop body are
 708      necessary, while still not decreasing the number of unrollings
 709      too much (at most by 1).  */
 710   best_copies = 2 * nunroll + 10;
 711
 712   i = 2 * nunroll + 2;
 713   if (i - 1 >= desc->niter)
 714     i = desc->niter - 2;
 715
 716   for (; i >= nunroll - 1; i--)
 717     {
 718       unsigned exit_mod = desc->niter % (i + 1);
 719
 720       if (!loop_exit_at_end_p (loop))
 721         n_copies = exit_mod + i + 1;
 722       else if (exit_mod != (unsigned) i
 723                || desc->noloop_assumptions != NULL_RTX)
 724         n_copies = exit_mod + i + 2;
 725       else
 726         n_copies = i + 1;
 727
 728       if (n_copies < best_copies)
 729         {
 730           best_copies = n_copies;
 731           best_unroll = i;
 732         }
 733     }
 734
 735   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 736   loop->lpt_decision.times = best_unroll;
 737 }
 738
 739 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 740    The transformation does this:
 741
 742    for (i = 0; i < 102; i++)
 743      body;
 744
 745    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 746
 747    i = 0;
 748    body; i++;
 749    body; i++;
 750    while (i < 102)
 751      {
 752        body; i++;
 753        body; i++;
 754        body; i++;
 755        body; i++;
 756      }
 757   */
 758 static void
 759 unroll_loop_constant_iterations (struct loop *loop)
 760 {
 761   unsigned HOST_WIDE_INT niter;
 762   unsigned exit_mod;
 763   sbitmap wont_exit;
 764   unsigned i;
 765   vec<edge> remove_edges;
 766   edge e;
 767   unsigned max_unroll = loop->lpt_decision.times;
 768   struct niter_desc *desc = get_simple_loop_desc (loop);
 769   bool exit_at_end = loop_exit_at_end_p (loop);
 770   struct opt_info *opt_info = NULL;
 771   bool ok;
 772
 773   niter = desc->niter;
 774
 775   /* Should not get here (such loop should be peeled instead).  */
 776   gcc_assert (niter > max_unroll + 1);
 777
 778   exit_mod = niter % (max_unroll + 1);
 779
 780   wont_exit = sbitmap_alloc (max_unroll + 1);
 781   bitmap_ones (wont_exit);
 782
 783   remove_edges.create (0);
 784   if (flag_split_ivs_in_unroller
 785       || flag_variable_expansion_in_unroller)
 786     opt_info = analyze_insns_in_loop (loop);
 787
 788   if (!exit_at_end)
 789     {
 790       /* The exit is not at the end of the loop; leave exit test
 791          in the first copy, so that the loops that start with test
 792          of exit condition have continuous body after unrolling.  */
 793
 794       if (dump_file)
 795         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 796
 797       /* Peel exit_mod iterations.  */
 798       bitmap_clear_bit (wont_exit, 0);
 799       if (desc->noloop_assumptions)
 800         bitmap_clear_bit (wont_exit, 1);
 801
 802       if (exit_mod)
 803         {
 804           opt_info_start_duplication (opt_info);
 805           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 806                                               exit_mod,
 807                                               wont_exit, desc->out_edge,
 808                                               &remove_edges,
 809                                               DLTHE_FLAG_UPDATE_FREQ
 810                                               | (opt_info && exit_mod > 1
 811                                                  ? DLTHE_RECORD_COPY_NUMBER
 812                                                    : 0));
 813           gcc_assert (ok);
 814
 815           if (opt_info && exit_mod > 1)
 816             apply_opt_in_copies (opt_info, exit_mod, false, false);
 817
 818           desc->noloop_assumptions = NULL_RTX;
 819           desc->niter -= exit_mod;
 820           loop->nb_iterations_upper_bound -= exit_mod;
 821           if (loop->any_estimate
 822               && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
 823             loop->nb_iterations_estimate -= exit_mod;
 824           else
 825             loop->any_estimate = false;
 826         }
 827
 828       bitmap_set_bit (wont_exit, 1);
 829     }
 830   else
 831     {
 832       /* Leave exit test in last copy, for the same reason as above if
 833          the loop tests the condition at the end of loop body.  */
 834
 835       if (dump_file)
 836         fprintf (dump_file, ";; Condition at end of loop.\n");
 837
 838       /* We know that niter >= max_unroll + 2; so we do not need to care of
 839          case when we would exit before reaching the loop.  So just peel
 840          exit_mod + 1 iterations.  */
 841       if (exit_mod != max_unroll
 842           || desc->noloop_assumptions)
 843         {
 844           bitmap_clear_bit (wont_exit, 0);
 845           if (desc->noloop_assumptions)
 846             bitmap_clear_bit (wont_exit, 1);
 847
 848           opt_info_start_duplication (opt_info);
 849           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 850                                               exit_mod + 1,
 851                                               wont_exit, desc->out_edge,
 852                                               &remove_edges,
 853                                               DLTHE_FLAG_UPDATE_FREQ
 854                                               | (opt_info && exit_mod > 0
 855                                                  ? DLTHE_RECORD_COPY_NUMBER
 856                                                    : 0));
 857           gcc_assert (ok);
 858
 859           if (opt_info && exit_mod > 0)
 860             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 861
 862           desc->niter -= exit_mod + 1;
 863           loop->nb_iterations_upper_bound -= exit_mod + 1;
 864           if (loop->any_estimate
 865               && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
 866             loop->nb_iterations_estimate -= exit_mod + 1;
 867           else
 868             loop->any_estimate = false;
 869           desc->noloop_assumptions = NULL_RTX;
 870
 871           bitmap_set_bit (wont_exit, 0);
 872           bitmap_set_bit (wont_exit, 1);
 873         }
 874
 875       bitmap_clear_bit (wont_exit, max_unroll);
 876     }
 877
 878   /* Now unroll the loop.  */
 879
 880   opt_info_start_duplication (opt_info);
 881   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 882                                       max_unroll,
 883                                       wont_exit, desc->out_edge,
 884                                       &remove_edges,
 885                                       DLTHE_FLAG_UPDATE_FREQ
 886                                       | (opt_info
 887                                          ? DLTHE_RECORD_COPY_NUMBER
 888                                            : 0));
 889   gcc_assert (ok);
 890
 891   if (opt_info)
 892     {
 893       apply_opt_in_copies (opt_info, max_unroll, true, true);
 894       free_opt_info (opt_info);
 895     }
 896
 897   free (wont_exit);
 898
 899   if (exit_at_end)
 900     {
 901       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 902       /* Find a new in and out edge; they are in the last copy we have made.  */
 903
 904       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 905         {
 906           desc->out_edge = EDGE_SUCC (exit_block, 0);
 907           desc->in_edge = EDGE_SUCC (exit_block, 1);
 908         }
 909       else
 910         {
 911           desc->out_edge = EDGE_SUCC (exit_block, 1);
 912           desc->in_edge = EDGE_SUCC (exit_block, 0);
 913         }
 914     }
 915
 916   desc->niter /= max_unroll + 1;
 917   loop->nb_iterations_upper_bound
 918     = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
 919   if (loop->any_estimate)
 920     loop->nb_iterations_estimate
 921       = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
 922   desc->niter_expr = GEN_INT (desc->niter);
 923
 924   /* Remove the edges.  */
 925   FOR_EACH_VEC_ELT (remove_edges, i, e)
 926     remove_path (e);
 927   remove_edges.release ();
 928
 929   if (dump_file)
 930     fprintf (dump_file,
 931              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 932              max_unroll, num_loop_insns (loop));
 933 }
 934
 935 /* Decide whether to unroll LOOP iterating runtime computable number of times
 936    and how much.  */
 937 static void
 938 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 939 {
 940   unsigned nunroll, nunroll_by_av, i;
 941   struct niter_desc *desc;
 942   widest_int iterations;
 943
 944   if (!(flags & UAP_UNROLL))
 945     {
 946       /* We were not asked to, just return back silently.  */
 947       return;
 948     }
 949
 950   if (dump_file)
 951     fprintf (dump_file,
 952              "\n;; Considering unrolling loop with runtime "
 953              "computable number of iterations\n");
 954
 955   /* nunroll = total number of copies of the original loop body in
 956      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 957   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 958   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 959   if (nunroll > nunroll_by_av)
 960     nunroll = nunroll_by_av;
 961   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 962     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 963
 964   if (targetm.loop_unroll_adjust)
 965     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 966
 967   /* Skip big loops.  */
 968   if (nunroll <= 1)
 969     {
 970       if (dump_file)
 971         fprintf (dump_file, ";; Not considering loop, is too big\n");
 972       return;
 973     }
 974
 975   /* Check for simple loops.  */
 976   desc = get_simple_loop_desc (loop);
 977
 978   /* Check simpleness.  */
 979   if (!desc->simple_p || desc->assumptions)
 980     {
 981       if (dump_file)
 982         fprintf (dump_file,
 983                  ";; Unable to prove that the number of iterations "
 984                  "can be counted in runtime\n");
 985       return;
 986     }
 987
 988   if (desc->const_iter)
 989     {
 990       if (dump_file)
 991         fprintf (dump_file, ";; Loop iterates constant times\n");
 992       return;
 993     }
 994
 995   /* Check whether the loop rolls.  */
 996   if ((get_estimated_loop_iterations (loop, &iterations)
 997        || get_max_loop_iterations (loop, &iterations))
 998       && wi::ltu_p (iterations, 2 * nunroll))
 999     {
1000       if (dump_file)
1001         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1002       return;
1003     }
1004
1005   /* Success; now force nunroll to be power of 2, as we are unable to
1006      cope with overflows in computation of number of iterations.  */
1007   for (i = 1; 2 * i <= nunroll; i *= 2)
1008     continue;
1009
1010   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1011   loop->lpt_decision.times = i - 1;
1012 }
1013
1014 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1015    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1016    and NULL is returned instead.  */
1017
1018 basic_block
1019 split_edge_and_insert (edge e, rtx insns)
1020 {
1021   basic_block bb;
1022
1023   if (!insns)
1024     return NULL;
1025   bb = split_edge (e);
1026   emit_insn_after (insns, BB_END (bb));
1027
1028   /* ??? We used to assume that INSNS can contain control flow insns, and
1029      that we had to try to find sub basic blocks in BB to maintain a valid
1030      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1031      and call break_superblocks when going out of cfglayout mode.  But it
1032      turns out that this never happens; and that if it does ever happen,
1033      the TODO_verify_flow at the end of the RTL loop passes would fail.
1034
1035      There are two reasons why we expected we could have control flow insns
1036      in INSNS.  The first is when a comparison has to be done in parts, and
1037      the second is when the number of iterations is computed for loops with
1038      the number of iterations known at runtime.  In both cases, test cases
1039      to get control flow in INSNS appear to be impossible to construct:
1040
1041       * If do_compare_rtx_and_jump needs several branches to do comparison
1042         in a mode that needs comparison by parts, we cannot analyze the
1043         number of iterations of the loop, and we never get to unrolling it.
1044
1045       * The code in expand_divmod that was suspected to cause creation of
1046         branching code seems to be only accessed for signed division.  The
1047         divisions used by # of iterations analysis are always unsigned.
1048         Problems might arise on architectures that emits branching code
1049         for some operations that may appear in the unroller (especially
1050         for division), but we have no such architectures.
1051
1052      Considering all this, it was decided that we should for now assume
1053      that INSNS can in theory contain control flow insns, but in practice
1054      it never does.  So we don't handle the theoretical case, and should
1055      a real failure ever show up, we have a pretty good clue for how to
1056      fix it.  */
1057
1058   return bb;
1059 }
1060
1061 /* Unroll LOOP for which we are able to count number of iterations in runtime
1062    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1063    extra care for case n < 0):
1064
1065    for (i = 0; i < n; i++)
1066      body;
1067
1068    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1069
1070    i = 0;
1071    mod = n % 4;
1072
1073    switch (mod)
1074      {
1075        case 3:
1076          body; i++;
1077        case 2:
1078          body; i++;
1079        case 1:
1080          body; i++;
1081        case 0: ;
1082      }
1083
1084    while (i < n)
1085      {
1086        body; i++;
1087        body; i++;
1088        body; i++;
1089        body; i++;
1090      }
1091    */
1092 static void
1093 unroll_loop_runtime_iterations (struct loop *loop)
1094 {
1095   rtx old_niter, niter, init_code, branch_code, tmp;
1096   unsigned i, j, p;
1097   basic_block preheader, *body, swtch, ezc_swtch;
1098   vec<basic_block> dom_bbs;
1099   sbitmap wont_exit;
1100   int may_exit_copy;
1101   unsigned n_peel;
1102   vec<edge> remove_edges;
1103   edge e;
1104   bool extra_zero_check, last_may_exit;
1105   unsigned max_unroll = loop->lpt_decision.times;
1106   struct niter_desc *desc = get_simple_loop_desc (loop);
1107   bool exit_at_end = loop_exit_at_end_p (loop);
1108   struct opt_info *opt_info = NULL;
1109   bool ok;
1110
1111   if (flag_split_ivs_in_unroller
1112       || flag_variable_expansion_in_unroller)
1113     opt_info = analyze_insns_in_loop (loop);
1114
1115   /* Remember blocks whose dominators will have to be updated.  */
1116   dom_bbs.create (0);
1117
1118   body = get_loop_body (loop);
1119   for (i = 0; i < loop->num_nodes; i++)
1120     {
1121       vec<basic_block> ldom;
1122       basic_block bb;
1123
1124       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1125       FOR_EACH_VEC_ELT (ldom, j, bb)
1126         if (!flow_bb_inside_loop_p (loop, bb))
1127           dom_bbs.safe_push (bb);
1128
1129       ldom.release ();
1130     }
1131   free (body);
1132
1133   if (!exit_at_end)
1134     {
1135       /* Leave exit in first copy (for explanation why see comment in
1136          unroll_loop_constant_iterations).  */
1137       may_exit_copy = 0;
1138       n_peel = max_unroll - 1;
1139       extra_zero_check = true;
1140       last_may_exit = false;
1141     }
1142   else
1143     {
1144       /* Leave exit in last copy (for explanation why see comment in
1145          unroll_loop_constant_iterations).  */
1146       may_exit_copy = max_unroll;
1147       n_peel = max_unroll;
1148       extra_zero_check = false;
1149       last_may_exit = true;
1150     }
1151
1152   /* Get expression for number of iterations.  */
1153   start_sequence ();
1154   old_niter = niter = gen_reg_rtx (desc->mode);
1155   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1156   if (tmp != niter)
1157     emit_move_insn (niter, tmp);
1158
1159   /* Count modulo by ANDing it with max_unroll; we use the fact that
1160      the number of unrollings is a power of two, and thus this is correct
1161      even if there is overflow in the computation.  */
1162   niter = expand_simple_binop (desc->mode, AND,
1163                                niter, gen_int_mode (max_unroll, desc->mode),
1164                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1165
1166   init_code = get_insns ();
1167   end_sequence ();
1168   unshare_all_rtl_in_chain (init_code);
1169
1170   /* Precondition the loop.  */
1171   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1172
1173   remove_edges.create (0);
1174
1175   wont_exit = sbitmap_alloc (max_unroll + 2);
1176
1177   /* Peel the first copy of loop body (almost always we must leave exit test
1178      here; the only exception is when we have extra zero check and the number
1179      of iterations is reliable.  Also record the place of (possible) extra
1180      zero check.  */
1181   bitmap_clear (wont_exit);
1182   if (extra_zero_check
1183       && !desc->noloop_assumptions)
1184     bitmap_set_bit (wont_exit, 1);
1185   ezc_swtch = loop_preheader_edge (loop)->src;
1186   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1187                                       1, wont_exit, desc->out_edge,
1188                                       &remove_edges,
1189                                       DLTHE_FLAG_UPDATE_FREQ);
1190   gcc_assert (ok);
1191
1192   /* Record the place where switch will be built for preconditioning.  */
1193   swtch = split_edge (loop_preheader_edge (loop));
1194
1195   for (i = 0; i < n_peel; i++)
1196     {
1197       /* Peel the copy.  */
1198       bitmap_clear (wont_exit);
1199       if (i != n_peel - 1 || !last_may_exit)
1200         bitmap_set_bit (wont_exit, 1);
1201       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1202                                           1, wont_exit, desc->out_edge,
1203                                           &remove_edges,
1204                                           DLTHE_FLAG_UPDATE_FREQ);
1205       gcc_assert (ok);
1206
1207       /* Create item for switch.  */
1208       j = n_peel - i - (extra_zero_check ? 0 : 1);
1209       p = REG_BR_PROB_BASE / (i + 2);
1210
1211       preheader = split_edge (loop_preheader_edge (loop));
1212       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1213                                           block_label (preheader), p,
1214                                           NULL_RTX);
1215
1216       /* We rely on the fact that the compare and jump cannot be optimized out,
1217          and hence the cfg we create is correct.  */
1218       gcc_assert (branch_code != NULL_RTX);
1219
1220       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1221       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1222       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1223       e = make_edge (swtch, preheader,
1224                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1225       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1226       e->probability = p;
1227     }
1228
1229   if (extra_zero_check)
1230     {
1231       /* Add branch for zero iterations.  */
1232       p = REG_BR_PROB_BASE / (max_unroll + 1);
1233       swtch = ezc_swtch;
1234       preheader = split_edge (loop_preheader_edge (loop));
1235       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1236                                           block_label (preheader), p,
1237                                           NULL_RTX);
1238       gcc_assert (branch_code != NULL_RTX);
1239
1240       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1241       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1242       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1243       e = make_edge (swtch, preheader,
1244                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1245       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1246       e->probability = p;
1247     }
1248
1249   /* Recount dominators for outer blocks.  */
1250   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1251
1252   /* And unroll loop.  */
1253
1254   bitmap_ones (wont_exit);
1255   bitmap_clear_bit (wont_exit, may_exit_copy);
1256   opt_info_start_duplication (opt_info);
1257
1258   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1259                                       max_unroll,
1260                                       wont_exit, desc->out_edge,
1261                                       &remove_edges,
1262                                       DLTHE_FLAG_UPDATE_FREQ
1263                                       | (opt_info
1264                                          ? DLTHE_RECORD_COPY_NUMBER
1265                                            : 0));
1266   gcc_assert (ok);
1267
1268   if (opt_info)
1269     {
1270       apply_opt_in_copies (opt_info, max_unroll, true, true);
1271       free_opt_info (opt_info);
1272     }
1273
1274   free (wont_exit);
1275
1276   if (exit_at_end)
1277     {
1278       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1279       /* Find a new in and out edge; they are in the last copy we have
1280          made.  */
1281
1282       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1283         {
1284           desc->out_edge = EDGE_SUCC (exit_block, 0);
1285           desc->in_edge = EDGE_SUCC (exit_block, 1);
1286         }
1287       else
1288         {
1289           desc->out_edge = EDGE_SUCC (exit_block, 1);
1290           desc->in_edge = EDGE_SUCC (exit_block, 0);
1291         }
1292     }
1293
1294   /* Remove the edges.  */
1295   FOR_EACH_VEC_ELT (remove_edges, i, e)
1296     remove_path (e);
1297   remove_edges.release ();
1298
1299   /* We must be careful when updating the number of iterations due to
1300      preconditioning and the fact that the value must be valid at entry
1301      of the loop.  After passing through the above code, we see that
1302      the correct new number of iterations is this:  */
1303   gcc_assert (!desc->const_iter);
1304   desc->niter_expr =
1305     simplify_gen_binary (UDIV, desc->mode, old_niter,
1306                          gen_int_mode (max_unroll + 1, desc->mode));
1307   loop->nb_iterations_upper_bound
1308     = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
1309   if (loop->any_estimate)
1310     loop->nb_iterations_estimate
1311       = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
1312   if (exit_at_end)
1313     {
1314       desc->niter_expr =
1315         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1316       desc->noloop_assumptions = NULL_RTX;
1317       --loop->nb_iterations_upper_bound;
1318       if (loop->any_estimate
1319           && loop->nb_iterations_estimate != 0)
1320         --loop->nb_iterations_estimate;
1321       else
1322         loop->any_estimate = false;
1323     }
1324
1325   if (dump_file)
1326     fprintf (dump_file,
1327              ";; Unrolled loop %d times, counting # of iterations "
1328              "in runtime, %i insns\n",
1329              max_unroll, num_loop_insns (loop));
1330
1331   dom_bbs.release ();
1332 }
1333
1334 /* Decide whether to simply peel LOOP and how much.  */
1335 static void
1336 decide_peel_simple (struct loop *loop, int flags)
1337 {
1338   unsigned npeel;
1339   widest_int iterations;
1340
1341   if (!(flags & UAP_PEEL))
1342     {
1343       /* We were not asked to, just return back silently.  */
1344       return;
1345     }
1346
1347   if (dump_file)
1348     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1349
1350   /* npeel = number of iterations to peel.  */
1351   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1352   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1353     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1354
1355   /* Skip big loops.  */
1356   if (!npeel)
1357     {
1358       if (dump_file)
1359         fprintf (dump_file, ";; Not considering loop, is too big\n");
1360       return;
1361     }
1362
1363   /* Do not simply peel loops with branches inside -- it increases number
1364      of mispredicts.
1365      Exception is when we do have profile and we however have good chance
1366      to peel proper number of iterations loop will iterate in practice.
1367      TODO: this heuristic needs tunning; while for complette unrolling
1368      the branch inside loop mostly eliminates any improvements, for
1369      peeling it is not the case.  Also a function call inside loop is
1370      also branch from branch prediction POV (and probably better reason
1371      to not unroll/peel).  */
1372   if (num_loop_branches (loop) > 1
1373       && profile_status != PROFILE_READ)
1374     {
1375       if (dump_file)
1376         fprintf (dump_file, ";; Not peeling, contains branches\n");
1377       return;
1378     }
1379
1380   /* If we have realistic estimate on number of iterations, use it.  */
1381   if (get_estimated_loop_iterations (loop, &iterations))
1382     {
1383       /* TODO: unsigned/signed confusion */
1384       if (wi::leu_p (npeel, iterations))
1385         {
1386           if (dump_file)
1387             {
1388               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1389               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1390                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1391               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1392                        npeel);
1393             }
1394           return;
1395         }
1396       npeel = iterations.to_shwi () + 1;
1397     }
1398   /* If we have small enough bound on iterations, we can still peel (completely
1399      unroll).  */
1400   else if (get_max_loop_iterations (loop, &iterations)
1401            && wi::ltu_p (iterations, npeel))
1402     npeel = iterations.to_shwi () + 1;
1403   else
1404     {
1405       /* For now we have no good heuristics to decide whether loop peeling
1406          will be effective, so disable it.  */
1407       if (dump_file)
1408         fprintf (dump_file,
1409                  ";; Not peeling loop, no evidence it will be profitable\n");
1410       return;
1411     }
1412
1413   /* Success.  */
1414   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1415   loop->lpt_decision.times = npeel;
1416 }
1417
1418 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1419
1420    while (cond)
1421      body;
1422
1423    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1424
1425    if (!cond) goto end;
1426    body;
1427    if (!cond) goto end;
1428    body;
1429    if (!cond) goto end;
1430    body;
1431    while (cond)
1432      body;
1433    end: ;
1434    */
1435 static void
1436 peel_loop_simple (struct loop *loop)
1437 {
1438   sbitmap wont_exit;
1439   unsigned npeel = loop->lpt_decision.times;
1440   struct niter_desc *desc = get_simple_loop_desc (loop);
1441   struct opt_info *opt_info = NULL;
1442   bool ok;
1443
1444   if (flag_split_ivs_in_unroller && npeel > 1)
1445     opt_info = analyze_insns_in_loop (loop);
1446
1447   wont_exit = sbitmap_alloc (npeel + 1);
1448   bitmap_clear (wont_exit);
1449
1450   opt_info_start_duplication (opt_info);
1451
1452   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1453                                       npeel, wont_exit, NULL,
1454                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1455                                       | (opt_info
1456                                          ? DLTHE_RECORD_COPY_NUMBER
1457                                            : 0));
1458   gcc_assert (ok);
1459
1460   free (wont_exit);
1461
1462   if (opt_info)
1463     {
1464       apply_opt_in_copies (opt_info, npeel, false, false);
1465       free_opt_info (opt_info);
1466     }
1467
1468   if (desc->simple_p)
1469     {
1470       if (desc->const_iter)
1471         {
1472           desc->niter -= npeel;
1473           desc->niter_expr = GEN_INT (desc->niter);
1474           desc->noloop_assumptions = NULL_RTX;
1475         }
1476       else
1477         {
1478           /* We cannot just update niter_expr, as its value might be clobbered
1479              inside loop.  We could handle this by counting the number into
1480              temporary just like we do in runtime unrolling, but it does not
1481              seem worthwhile.  */
1482           free_simple_loop_desc (loop);
1483         }
1484     }
1485   if (dump_file)
1486     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1487 }
1488
1489 /* Decide whether to unroll LOOP stupidly and how much.  */
1490 static void
1491 decide_unroll_stupid (struct loop *loop, int flags)
1492 {
1493   unsigned nunroll, nunroll_by_av, i;
1494   struct niter_desc *desc;
1495   widest_int iterations;
1496
1497   if (!(flags & UAP_UNROLL_ALL))
1498     {
1499       /* We were not asked to, just return back silently.  */
1500       return;
1501     }
1502
1503   if (dump_file)
1504     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1505
1506   /* nunroll = total number of copies of the original loop body in
1507      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1508   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1509   nunroll_by_av
1510     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1511   if (nunroll > nunroll_by_av)
1512     nunroll = nunroll_by_av;
1513   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1514     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1515
1516   if (targetm.loop_unroll_adjust)
1517     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1518
1519   /* Skip big loops.  */
1520   if (nunroll <= 1)
1521     {
1522       if (dump_file)
1523         fprintf (dump_file, ";; Not considering loop, is too big\n");
1524       return;
1525     }
1526
1527   /* Check for simple loops.  */
1528   desc = get_simple_loop_desc (loop);
1529
1530   /* Check simpleness.  */
1531   if (desc->simple_p && !desc->assumptions)
1532     {
1533       if (dump_file)
1534         fprintf (dump_file, ";; The loop is simple\n");
1535       return;
1536     }
1537
1538   /* Do not unroll loops with branches inside -- it increases number
1539      of mispredicts.
1540      TODO: this heuristic needs tunning; call inside the loop body
1541      is also relatively good reason to not unroll.  */
1542   if (num_loop_branches (loop) > 1)
1543     {
1544       if (dump_file)
1545         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1546       return;
1547     }
1548
1549   /* Check whether the loop rolls.  */
1550   if ((get_estimated_loop_iterations (loop, &iterations)
1551        || get_max_loop_iterations (loop, &iterations))
1552       && wi::ltu_p (iterations, 2 * nunroll))
1553     {
1554       if (dump_file)
1555         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1556       return;
1557     }
1558
1559   /* Success.  Now force nunroll to be power of 2, as it seems that this
1560      improves results (partially because of better alignments, partially
1561      because of some dark magic).  */
1562   for (i = 1; 2 * i <= nunroll; i *= 2)
1563     continue;
1564
1565   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1566   loop->lpt_decision.times = i - 1;
1567 }
1568
1569 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1570
1571    while (cond)
1572      body;
1573
1574    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1575
1576    while (cond)
1577      {
1578        body;
1579        if (!cond) break;
1580        body;
1581        if (!cond) break;
1582        body;
1583        if (!cond) break;
1584        body;
1585      }
1586    */
1587 static void
1588 unroll_loop_stupid (struct loop *loop)
1589 {
1590   sbitmap wont_exit;
1591   unsigned nunroll = loop->lpt_decision.times;
1592   struct niter_desc *desc = get_simple_loop_desc (loop);
1593   struct opt_info *opt_info = NULL;
1594   bool ok;
1595
1596   if (flag_split_ivs_in_unroller
1597       || flag_variable_expansion_in_unroller)
1598     opt_info = analyze_insns_in_loop (loop);
1599
1600
1601   wont_exit = sbitmap_alloc (nunroll + 1);
1602   bitmap_clear (wont_exit);
1603   opt_info_start_duplication (opt_info);
1604
1605   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1606                                       nunroll, wont_exit,
1607                                       NULL, NULL,
1608                                       DLTHE_FLAG_UPDATE_FREQ
1609                                       | (opt_info
1610                                          ? DLTHE_RECORD_COPY_NUMBER
1611                                            : 0));
1612   gcc_assert (ok);
1613
1614   if (opt_info)
1615     {
1616       apply_opt_in_copies (opt_info, nunroll, true, true);
1617       free_opt_info (opt_info);
1618     }
1619
1620   free (wont_exit);
1621
1622   if (desc->simple_p)
1623     {
1624       /* We indeed may get here provided that there are nontrivial assumptions
1625          for a loop to be really simple.  We could update the counts, but the
1626          problem is that we are unable to decide which exit will be taken
1627          (not really true in case the number of iterations is constant,
1628          but no one will do anything with this information, so we do not
1629          worry about it).  */
1630       desc->simple_p = false;
1631     }
1632
1633   if (dump_file)
1634     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1635              nunroll, num_loop_insns (loop));
1636 }
1637
1638 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1639    Set *DEBUG_USES to the number of debug insns that reference the
1640    variable.  */
1641
1642 bool
1643 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1644                                   int *debug_uses)
1645 {
1646   basic_block *body, bb;
1647   unsigned i;
1648   int count_ref = 0;
1649   rtx insn;
1650
1651   body = get_loop_body (loop);
1652   for (i = 0; i < loop->num_nodes; i++)
1653     {
1654       bb = body[i];
1655
1656       FOR_BB_INSNS (bb, insn)
1657         if (!rtx_referenced_p (reg, insn))
1658           continue;
1659         else if (DEBUG_INSN_P (insn))
1660           ++*debug_uses;
1661         else if (++count_ref > 1)
1662           break;
1663     }
1664   free (body);
1665   return (count_ref  == 1);
1666 }
1667
1668 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1669
1670 static void
1671 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1672 {
1673   basic_block *body, bb;
1674   unsigned i;
1675   rtx insn;
1676
1677   body = get_loop_body (loop);
1678   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1679     {
1680       bb = body[i];
1681
1682       FOR_BB_INSNS (bb, insn)
1683         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1684           continue;
1685         else
1686           {
1687             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1688                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1689             if (!--debug_uses)
1690               break;
1691           }
1692     }
1693   free (body);
1694 }
1695
1696 /* Determine whether INSN contains an accumulator
1697    which can be expanded into separate copies,
1698    one for each copy of the LOOP body.
1699
1700    for (i = 0 ; i < n; i++)
1701      sum += a[i];
1702
1703    ==>
1704
1705    sum += a[i]
1706    ....
1707    i = i+1;
1708    sum1 += a[i]
1709    ....
1710    i = i+1
1711    sum2 += a[i];
1712    ....
1713
1714    Return NULL if INSN contains no opportunity for expansion of accumulator.
1715    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1716    information and return a pointer to it.
1717 */
1718
1719 static struct var_to_expand *
1720 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1721 {
1722   rtx set, dest, src;
1723   struct var_to_expand *ves;
1724   unsigned accum_pos;
1725   enum rtx_code code;
1726   int debug_uses = 0;
1727
1728   set = single_set (insn);
1729   if (!set)
1730     return NULL;
1731
1732   dest = SET_DEST (set);
1733   src = SET_SRC (set);
1734   code = GET_CODE (src);
1735
1736   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1737     return NULL;
1738
1739   if (FLOAT_MODE_P (GET_MODE (dest)))
1740     {
1741       if (!flag_associative_math)
1742         return NULL;
1743       /* In the case of FMA, we're also changing the rounding.  */
1744       if (code == FMA && !flag_unsafe_math_optimizations)
1745         return NULL;
1746     }
1747
1748   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1749      in MD.  But if there is no optab to generate the insn, we can not
1750      perform the variable expansion.  This can happen if an MD provides
1751      an insn but not a named pattern to generate it, for example to avoid
1752      producing code that needs additional mode switches like for x87/mmx.
1753
1754      So we check have_insn_for which looks for an optab for the operation
1755      in SRC.  If it doesn't exist, we can't perform the expansion even
1756      though INSN is valid.  */
1757   if (!have_insn_for (code, GET_MODE (src)))
1758     return NULL;
1759
1760   if (!REG_P (dest)
1761       && !(GET_CODE (dest) == SUBREG
1762            && REG_P (SUBREG_REG (dest))))
1763     return NULL;
1764
1765   /* Find the accumulator use within the operation.  */
1766   if (code == FMA)
1767     {
1768       /* We only support accumulation via FMA in the ADD position.  */
1769       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1770         return NULL;
1771       accum_pos = 2;
1772     }
1773   else if (rtx_equal_p (dest, XEXP (src, 0)))
1774     accum_pos = 0;
1775   else if (rtx_equal_p (dest, XEXP (src, 1)))
1776     {
1777       /* The method of expansion that we are using; which includes the
1778          initialization of the expansions with zero and the summation of
1779          the expansions at the end of the computation will yield wrong
1780          results for (x = something - x) thus avoid using it in that case.  */
1781       if (code == MINUS)
1782         return NULL;
1783       accum_pos = 1;
1784     }
1785   else
1786     return NULL;
1787
1788   /* It must not otherwise be used.  */
1789   if (code == FMA)
1790     {
1791       if (rtx_referenced_p (dest, XEXP (src, 0))
1792           || rtx_referenced_p (dest, XEXP (src, 1)))
1793         return NULL;
1794     }
1795   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1796     return NULL;
1797
1798   /* It must be used in exactly one insn.  */
1799   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1800     return NULL;
1801
1802   if (dump_file)
1803     {
1804       fprintf (dump_file, "\n;; Expanding Accumulator ");
1805       print_rtl (dump_file, dest);
1806       fprintf (dump_file, "\n");
1807     }
1808
1809   if (debug_uses)
1810     /* Instead of resetting the debug insns, we could replace each
1811        debug use in the loop with the sum or product of all expanded
1812        accummulators.  Since we'll only know of all expansions at the
1813        end, we'd have to keep track of which vars_to_expand a debug
1814        insn in the loop references, take note of each copy of the
1815        debug insn during unrolling, and when it's all done, compute
1816        the sum or product of each variable and adjust the original
1817        debug insn and each copy thereof.  What a pain!  */
1818     reset_debug_uses_in_loop (loop, dest, debug_uses);
1819
1820   /* Record the accumulator to expand.  */
1821   ves = XNEW (struct var_to_expand);
1822   ves->insn = insn;
1823   ves->reg = copy_rtx (dest);
1824   ves->var_expansions.create (1);
1825   ves->next = NULL;
1826   ves->op = GET_CODE (src);
1827   ves->expansion_count = 0;
1828   ves->reuse_expansion = 0;
1829   return ves;
1830 }
1831
1832 /* Determine whether there is an induction variable in INSN that
1833    we would like to split during unrolling.
1834
1835    I.e. replace
1836
1837    i = i + 1;
1838    ...
1839    i = i + 1;
1840    ...
1841    i = i + 1;
1842    ...
1843
1844    type chains by
1845
1846    i0 = i + 1
1847    ...
1848    i = i0 + 1
1849    ...
1850    i = i0 + 2
1851    ...
1852
1853    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1854    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1855    pointer to it.  */
1856
1857 static struct iv_to_split *
1858 analyze_iv_to_split_insn (rtx insn)
1859 {
1860   rtx set, dest;
1861   struct rtx_iv iv;
1862   struct iv_to_split *ivts;
1863   bool ok;
1864
1865   /* For now we just split the basic induction variables.  Later this may be
1866      extended for example by selecting also addresses of memory references.  */
1867   set = single_set (insn);
1868   if (!set)
1869     return NULL;
1870
1871   dest = SET_DEST (set);
1872   if (!REG_P (dest))
1873     return NULL;
1874
1875   if (!biv_p (insn, dest))
1876     return NULL;
1877
1878   ok = iv_analyze_result (insn, dest, &iv);
1879
1880   /* This used to be an assert under the assumption that if biv_p returns
1881      true that iv_analyze_result must also return true.  However, that
1882      assumption is not strictly correct as evidenced by pr25569.
1883
1884      Returning NULL when iv_analyze_result returns false is safe and
1885      avoids the problems in pr25569 until the iv_analyze_* routines
1886      can be fixed, which is apparently hard and time consuming
1887      according to their author.  */
1888   if (! ok)
1889     return NULL;
1890
1891   if (iv.step == const0_rtx
1892       || iv.mode != iv.extend_mode)
1893     return NULL;
1894
1895   /* Record the insn to split.  */
1896   ivts = XNEW (struct iv_to_split);
1897   ivts->insn = insn;
1898   ivts->orig_var = dest;
1899   ivts->base_var = NULL_RTX;
1900   ivts->step = iv.step;
1901   ivts->next = NULL;
1902   ivts->n_loc = 1;
1903   ivts->loc[0] = 1;
1904
1905   return ivts;
1906 }
1907
1908 /* Determines which of insns in LOOP can be optimized.
1909    Return a OPT_INFO struct with the relevant hash tables filled
1910    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1911    is undefined for the return value.  */
1912
1913 static struct opt_info *
1914 analyze_insns_in_loop (struct loop *loop)
1915 {
1916   basic_block *body, bb;
1917   unsigned i;
1918   struct opt_info *opt_info = XCNEW (struct opt_info);
1919   rtx insn;
1920   struct iv_to_split *ivts = NULL;
1921   struct var_to_expand *ves = NULL;
1922   iv_to_split **slot1;
1923   var_to_expand **slot2;
1924   vec<edge> edges = get_loop_exit_edges (loop);
1925   edge exit;
1926   bool can_apply = false;
1927
1928   iv_analysis_loop_init (loop);
1929
1930   body = get_loop_body (loop);
1931
1932   if (flag_split_ivs_in_unroller)
1933     {
1934       opt_info->insns_to_split.create (5 * loop->num_nodes);
1935       opt_info->iv_to_split_head = NULL;
1936       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1937     }
1938
1939   /* Record the loop exit bb and loop preheader before the unrolling.  */
1940   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1941
1942   if (edges.length () == 1)
1943     {
1944       exit = edges[0];
1945       if (!(exit->flags & EDGE_COMPLEX))
1946         {
1947           opt_info->loop_exit = split_edge (exit);
1948           can_apply = true;
1949         }
1950     }
1951
1952   if (flag_variable_expansion_in_unroller
1953       && can_apply)
1954     {
1955       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1956       opt_info->var_to_expand_head = NULL;
1957       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1958     }
1959
1960   for (i = 0; i < loop->num_nodes; i++)
1961     {
1962       bb = body[i];
1963       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1964         continue;
1965
1966       FOR_BB_INSNS (bb, insn)
1967       {
1968         if (!INSN_P (insn))
1969           continue;
1970
1971         if (opt_info->insns_to_split.is_created ())
1972           ivts = analyze_iv_to_split_insn (insn);
1973
1974         if (ivts)
1975           {
1976             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
1977             gcc_assert (*slot1 == NULL);
1978             *slot1 = ivts;
1979             *opt_info->iv_to_split_tail = ivts;
1980             opt_info->iv_to_split_tail = &ivts->next;
1981             continue;
1982           }
1983
1984         if (opt_info->insns_with_var_to_expand.is_created ())
1985           ves = analyze_insn_to_expand_var (loop, insn);
1986
1987         if (ves)
1988           {
1989             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
1990             gcc_assert (*slot2 == NULL);
1991             *slot2 = ves;
1992             *opt_info->var_to_expand_tail = ves;
1993             opt_info->var_to_expand_tail = &ves->next;
1994           }
1995       }
1996     }
1997
1998   edges.release ();
1999   free (body);
2000   return opt_info;
2001 }
2002
2003 /* Called just before loop duplication.  Records start of duplicated area
2004    to OPT_INFO.  */
2005
2006 static void
2007 opt_info_start_duplication (struct opt_info *opt_info)
2008 {
2009   if (opt_info)
2010     opt_info->first_new_block = last_basic_block;
2011 }
2012
/* Return the number of iterations between the initialization of the base
   variable and the copy number N_COPY.  N_COPIES is the total number of
   newly created copies.  UNROLLING is true if we are unrolling (not
   peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization is done in the original loop body
     (number 0), so the distance is simply the copy number.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization occurs in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
2037
2038 /* Locate in EXPR the expression corresponding to the location recorded
2039    in IVTS, and return a pointer to the RTX for this location.  */
2040
2041 static rtx *
2042 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2043 {
2044   unsigned i;
2045   rtx *ret = &expr;
2046
2047   for (i = 0; i < ivts->n_loc; i++)
2048     ret = &XEXP (*ret, ivts->loc[i]);
2049
2050   return ret;
2051 }
2052
2053 /* Allocate basic variable for the induction variable chain.  */
2054
2055 static void
2056 allocate_basic_variable (struct iv_to_split *ivts)
2057 {
2058   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2059
2060   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2061 }
2062
2063 /* Insert initialization of basic variable of IVTS before INSN, taking
2064    the initial value from INSN.  */
2065
2066 static void
2067 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2068 {
2069   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2070   rtx seq;
2071
2072   start_sequence ();
2073   expr = force_operand (expr, ivts->base_var);
2074   if (expr != ivts->base_var)
2075     emit_move_insn (ivts->base_var, expr);
2076   seq = get_insns ();
2077   end_sequence ();
2078
2079   emit_insn_before (seq, insn);
2080 }
2081
2082 /* Replace the use of induction variable described in IVTS in INSN
2083    by base variable + DELTA * step.  */
2084
2085 static void
2086 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2087 {
2088   rtx expr, *loc, seq, incr, var;
2089   enum machine_mode mode = GET_MODE (ivts->base_var);
2090   rtx src, dest, set;
2091
2092   /* Construct base + DELTA * step.  */
2093   if (!delta)
2094     expr = ivts->base_var;
2095   else
2096     {
2097       incr = simplify_gen_binary (MULT, mode,
2098                                   ivts->step, gen_int_mode (delta, mode));
2099       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2100                                   ivts->base_var, incr);
2101     }
2102
2103   /* Figure out where to do the replacement.  */
2104   loc = get_ivts_expr (single_set (insn), ivts);
2105
2106   /* If we can make the replacement right away, we're done.  */
2107   if (validate_change (insn, loc, expr, 0))
2108     return;
2109
2110   /* Otherwise, force EXPR into a register and try again.  */
2111   start_sequence ();
2112   var = gen_reg_rtx (mode);
2113   expr = force_operand (expr, var);
2114   if (expr != var)
2115     emit_move_insn (var, expr);
2116   seq = get_insns ();
2117   end_sequence ();
2118   emit_insn_before (seq, insn);
2119
2120   if (validate_change (insn, loc, var, 0))
2121     return;
2122
2123   /* The last chance.  Try recreating the assignment in insn
2124      completely from scratch.  */
2125   set = single_set (insn);
2126   gcc_assert (set);
2127
2128   start_sequence ();
2129   *loc = var;
2130   src = copy_rtx (SET_SRC (set));
2131   dest = copy_rtx (SET_DEST (set));
2132   src = force_operand (src, dest);
2133   if (src != dest)
2134     emit_move_insn (dest, src);
2135   seq = get_insns ();
2136   end_sequence ();
2137
2138   emit_insn_before (seq, insn);
2139   delete_insn (insn);
2140 }
2141
2142
2143 /* Return one expansion of the accumulator recorded in struct VE.  */
2144
2145 static rtx
2146 get_expansion (struct var_to_expand *ve)
2147 {
2148   rtx reg;
2149
2150   if (ve->reuse_expansion == 0)
2151     reg = ve->reg;
2152   else
2153     reg = ve->var_expansions[ve->reuse_expansion - 1];
2154
2155   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2156     ve->reuse_expansion = 0;
2157   else
2158     ve->reuse_expansion++;
2159
2160   return reg;
2161 }
2162
2163
2164 /* Given INSN replace the uses of the accumulator recorded in VE
2165    with a new register.  */
2166
2167 static void
2168 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2169 {
2170   rtx new_reg, set;
2171   bool really_new_expansion = false;
2172
2173   set = single_set (insn);
2174   gcc_assert (set);
2175
2176   /* Generate a new register only if the expansion limit has not been
2177      reached.  Else reuse an already existing expansion.  */
2178   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2179     {
2180       really_new_expansion = true;
2181       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2182     }
2183   else
2184     new_reg = get_expansion (ve);
2185
2186   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2187   if (apply_change_group ())
2188     if (really_new_expansion)
2189       {
2190         ve->var_expansions.safe_push (new_reg);
2191         ve->expansion_count++;
2192       }
2193 }
2194
2195 /* Initialize the variable expansions in loop preheader.  PLACE is the
2196    loop-preheader basic block where the initialization of the
2197    expansions should take place.  The expansions are initialized with
2198    (-0) when the operation is plus or minus to honor sign zero.  This
2199    way we can prevent cases where the sign of the final result is
2200    effected by the sign of the expansion.  Here is an example to
2201    demonstrate this:
2202
2203    for (i = 0 ; i < n; i++)
2204      sum += something;
2205
2206    ==>
2207
2208    sum += something
2209    ....
2210    i = i+1;
2211    sum1 += something
2212    ....
2213    i = i+1
2214    sum2 += something;
2215    ....
2216
2217    When SUM is initialized with -zero and SOMETHING is also -zero; the
2218    final result of sum should be -zero thus the expansions sum1 and sum2
2219    should be initialized with -zero as well (otherwise we will get +zero
2220    as the final result).  */
2221
2222 static void
2223 insert_var_expansion_initialization (struct var_to_expand *ve,
2224                                      basic_block place)
2225 {
2226   rtx seq, var, zero_init;
2227   unsigned i;
2228   enum machine_mode mode = GET_MODE (ve->reg);
2229   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2230
2231   if (ve->var_expansions.length () == 0)
2232     return;
2233
2234   start_sequence ();
2235   switch (ve->op)
2236     {
2237     case FMA:
2238       /* Note that we only accumulate FMA via the ADD operand.  */
2239     case PLUS:
2240     case MINUS:
2241       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2242         {
2243           if (honor_signed_zero_p)
2244             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2245           else
2246             zero_init = CONST0_RTX (mode);
2247           emit_move_insn (var, zero_init);
2248         }
2249       break;
2250
2251     case MULT:
2252       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2253         {
2254           zero_init = CONST1_RTX (GET_MODE (var));
2255           emit_move_insn (var, zero_init);
2256         }
2257       break;
2258
2259     default:
2260       gcc_unreachable ();
2261     }
2262
2263   seq = get_insns ();
2264   end_sequence ();
2265
2266   emit_insn_after (seq, BB_END (place));
2267 }
2268
2269 /* Combine the variable expansions at the loop exit.  PLACE is the
2270    loop exit basic block where the summation of the expansions should
2271    take place.  */
2272
2273 static void
2274 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2275 {
2276   rtx sum = ve->reg;
2277   rtx expr, seq, var, insn;
2278   unsigned i;
2279
2280   if (ve->var_expansions.length () == 0)
2281     return;
2282
2283   start_sequence ();
2284   switch (ve->op)
2285     {
2286     case FMA:
2287       /* Note that we only accumulate FMA via the ADD operand.  */
2288     case PLUS:
2289     case MINUS:
2290       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2291         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2292       break;
2293
2294     case MULT:
2295       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2296         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2297       break;
2298
2299     default:
2300       gcc_unreachable ();
2301     }
2302
2303   expr = force_operand (sum, ve->reg);
2304   if (expr != ve->reg)
2305     emit_move_insn (ve->reg, expr);
2306   seq = get_insns ();
2307   end_sequence ();
2308
2309   insn = BB_HEAD (place);
2310   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2311     insn = NEXT_INSN (insn);
2312
2313   emit_insn_after (seq, insn);
2314 }
2315
2316 /* Strip away REG_EQUAL notes for IVs we're splitting.
2317
2318    Updating REG_EQUAL notes for IVs we split is tricky: We
2319    cannot tell until after unrolling, DF-rescanning, and liveness
2320    updating, whether an EQ_USE is reached by the split IV while
2321    the IV reg is still live.  See PR55006.
2322
2323    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2324    because RTL loop-iv requires us to defer rescanning insns and
2325    any notes attached to them.  So resort to old techniques...  */
2326
2327 static void
2328 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2329 {
2330   struct iv_to_split *ivts;
2331   rtx note = find_reg_equal_equiv_note (insn);
2332   if (! note)
2333     return;
2334   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2335     if (reg_mentioned_p (ivts->orig_var, note))
2336       {
2337         remove_note (insn, note);
2338         return;
2339       }
2340 }
2341
/* Apply the loop optimizations to the copies of the loop body, using the
   data gathered before the unrolling and recorded in OPT_INFO.

   N_COPIES is the number of copies of the body that were created.
   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original
   body of the loop (as it should happen in complete unrolling, but not in
   ordinary peeling of the loop).

   The function works in three steps: it rewrites the insns in the newly
   created blocks (IV splitting, and variable expansion when unrolling),
   emits the initialization and the final combination of the expanded
   variables, and finally applies IV splitting to the original body.  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
                     unsigned n_copies, bool unrolling,
                     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx insn, orig_insn, next;
  /* Stack templates used only as lookup keys for the hash tables; just
     their INSN field is filled in below.  */
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split.is_created ())
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  /* Step 1: rewrite the insns in all blocks created by the duplication,
     i.e. those numbered from FIRST_NEW_BLOCK on.  */
  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
         duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
                                        unrolling);
      bb->aux = 0;

      /* Walk the copy and its original in lockstep: the same kinds of
         insns are skipped on both sides, so that ORIG_INSN is the insn
         INSN corresponds to.  */
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
        {
          /* Skip non-insns and debug insns for LABEL_DECLs.  */
          if (!INSN_P (insn)
              || (DEBUG_INSN_P (insn)
                  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
            continue;

          /* Skip the same kinds of insns in the original block.  */
          while (!INSN_P (orig_insn)
                 || (DEBUG_INSN_P (orig_insn)
                     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
                         == LABEL_DECL)))
            orig_insn = NEXT_INSN (orig_insn);

          /* The tables were filled with the insns of the original body,
             so the lookups below go through ORIG_INSN.  */
          ivts_templ.insn = orig_insn;
          ve_templ.insn = orig_insn;

          /* Apply splitting iv optimization.  */
          if (opt_info->insns_to_split.is_created ())
            {
              maybe_strip_eq_note_for_split_iv (opt_info, insn);

              ivts = opt_info->insns_to_split.find (&ivts_templ);

              if (ivts)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));

                  /* The copy at distance zero is the one that initializes
                     the base variable.  */
                  if (!delta)
                    insert_base_initialization (ivts, insn);
                  split_iv (ivts, insn, delta);
                }
            }
          /* Apply variable expansion optimization.  */
          if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
            {
              ves = (struct var_to_expand *)
                opt_info->insns_with_var_to_expand.find (&ve_templ);
              if (ves)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));
                  expand_var_during_unrolling (ves, insn);
                }
            }
          orig_insn = NEXT_INSN (orig_insn);
        }
    }

  if (!rewrite_original_loop)
    return;

  /* Step 2: initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand.is_created ())
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Step 3: rewrite also the original loop body.  Find them as originals
     of the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  Only IV splitting is
     applied here; the original insns keep the original accumulator.  */
  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
        continue;

      /* The original body has copy number 0.  */
      delta = determine_split_iv_delta (0, n_copies, unrolling);
      /* NOTE(review): the walk starts at the head of ORIG_BB but is
         bounded by the end of BB (the copy), not by the end of ORIG_BB.
         This looks like it was meant to be BB_END (orig_bb) -- confirm
         before changing, since the walk as written also visits whatever
         insns lie between the two blocks in the insn chain.  */
      for (orig_insn = BB_HEAD (orig_bb);
           orig_insn != NEXT_INSN (BB_END (bb));
           orig_insn = next)
        {
          /* Fetch the successor first; split_iv may delete ORIG_INSN.  */
          next = NEXT_INSN (orig_insn);

          if (!INSN_P (orig_insn))
            continue;

          ivts_templ.insn = orig_insn;
          if (opt_info->insns_to_split.is_created ())
            {
              maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

              ivts = (struct iv_to_split *)
                opt_info->insns_to_split.find (&ivts_templ);
              if (ivts)
                {
                  if (!delta)
                    insert_base_initialization (ivts, orig_insn);
                  split_iv (ivts, orig_insn, delta);
                  continue;
                }
            }

        }
    }
}
2483
2484 /* Release OPT_INFO.  */
2485
2486 static void
2487 free_opt_info (struct opt_info *opt_info)
2488 {
2489   if (opt_info->insns_to_split.is_created ())
2490     opt_info->insns_to_split.dispose ();
2491   if (opt_info->insns_with_var_to_expand.is_created ())
2492     {
2493       struct var_to_expand *ves;
2494
2495       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2496         ves->var_expansions.release ();
2497       opt_info->insns_with_var_to_expand.dispose ();
2498     }
2499   free (opt_info);
2500 }