gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2014 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "tree.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hash-table.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
  42    by fewer testings of exit conditions.  The main problem is code growth,
  43    that impacts performance negatively due to effect of caches.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
  64    There are a lot of parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
  66
  67    ??? A great problem is that we don't have a good way to determine
  68    how many times we should unroll the loop; the experiments I have made
  69    showed that this choice may affect performance by several percent.
  70    */
  71
  72 /* Information about induction variables to split.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx insn;             /* The insn in that the induction variable occurs.  */
  77   rtx orig_var;         /* The variable (register) for the IV before split.  */
  78   rtx base_var;         /* The variable on that the values in the further
  79                            iterations are based.  */
  80   rtx step;             /* Step of the induction variable.  */
  81   struct iv_to_split *next; /* Next entry in walking order.  */
  82   unsigned n_loc;
  83   unsigned loc[3];      /* Location where the definition of the induction
  84                            variable occurs in the insn.  For example if
  85                            N_LOC is 2, the expression is located at
  86                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  87 };
  88
  89 /* Information about accumulators to expand.  */
  90
  91 struct var_to_expand
  92 {
  93   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  94   rtx reg;                         /* The accumulator which is expanded.  */
  95   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  96   struct var_to_expand *next;      /* Next entry in walking order.  */
  97   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  98                                       or multiplication.  */
  99   int expansion_count;             /* Count the number of expansions generated so far.  */
 100   int reuse_expansion;             /* The expansion we intend to reuse to expand
 101                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 102                                       the original accumulator.  Else use
 103                                       var_expansions[REUSE_EXPANSION - 1].  */
 104 };
 105
 106 /* Hashtable helper for iv_to_split.  */
 107
 108 struct iv_split_hasher : typed_free_remove <iv_to_split>
 109 {
 110   typedef iv_to_split value_type;
 111   typedef iv_to_split compare_type;
 112   static inline hashval_t hash (const value_type *);
 113   static inline bool equal (const value_type *, const compare_type *);
 114 };
 115
 116
 117 /* A hash function for information about insns to split.  */
 118
 119 inline hashval_t
 120 iv_split_hasher::hash (const value_type *ivts)
 121 {
 122   return (hashval_t) INSN_UID (ivts->insn);
 123 }
 124
 125 /* An equality functions for information about insns to split.  */
 126
 127 inline bool
 128 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
 129 {
 130   return i1->insn == i2->insn;
 131 }
 132
 133 /* Hashtable helper for iv_to_split.  */
 134
 135 struct var_expand_hasher : typed_free_remove <var_to_expand>
 136 {
 137   typedef var_to_expand value_type;
 138   typedef var_to_expand compare_type;
 139   static inline hashval_t hash (const value_type *);
 140   static inline bool equal (const value_type *, const compare_type *);
 141 };
 142
 143 /* Return a hash for VES.  */
 144
 145 inline hashval_t
 146 var_expand_hasher::hash (const value_type *ves)
 147 {
 148   return (hashval_t) INSN_UID (ves->insn);
 149 }
 150
 151 /* Return true if I1 and I2 refer to the same instruction.  */
 152
 153 inline bool
 154 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
 155 {
 156   return i1->insn == i2->insn;
 157 }
 158
 159 /* Information about optimization applied in
 160    the unrolled loop.  */
 161
 162 struct opt_info
 163 {
 164   hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
 165                                                   split.  */
 166   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 167   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 168   hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
 169                                         insns with accumulators to expand.  */
 170   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 171   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 172   unsigned first_new_block;        /* The first basic block that was
 173                                       duplicated.  */
 174   basic_block loop_exit;           /* The loop exit basic block.  */
 175   basic_block loop_preheader;      /* The loop preheader basic block.  */
 176 };
 177
 178 static void decide_unrolling_and_peeling (int);
 179 static void peel_loops_completely (int);
 180 static void decide_peel_simple (struct loop *, int);
 181 static void decide_peel_once_rolling (struct loop *, int);
 182 static void decide_peel_completely (struct loop *, int);
 183 static void decide_unroll_stupid (struct loop *, int);
 184 static void decide_unroll_constant_iterations (struct loop *, int);
 185 static void decide_unroll_runtime_iterations (struct loop *, int);
 186 static void peel_loop_simple (struct loop *);
 187 static void peel_loop_completely (struct loop *);
 188 static void unroll_loop_stupid (struct loop *);
 189 static void unroll_loop_constant_iterations (struct loop *);
 190 static void unroll_loop_runtime_iterations (struct loop *);
 191 static struct opt_info *analyze_insns_in_loop (struct loop *);
 192 static void opt_info_start_duplication (struct opt_info *);
 193 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 194 static void free_opt_info (struct opt_info *);
 195 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 196 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 197 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 198 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 199 static void insert_var_expansion_initialization (struct var_to_expand *,
 200                                                  basic_block);
 201 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 202                                              basic_block);
 203 static rtx get_expansion (struct var_to_expand *);
 204
 205 /* Emit a message summarizing the unroll or peel that will be
 206    performed for LOOP, along with the loop's location LOCUS, if
 207    appropriate given the dump or -fopt-info settings.  */
 208
 209 static void
 210 report_unroll_peel (struct loop *loop, location_t locus)
 211 {
 212   struct niter_desc *desc;
 213   int niters = 0;
 214   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 215
 216   if (loop->lpt_decision.decision == LPT_NONE)
 217     return;
 218
 219   if (!dump_enabled_p ())
 220     return;
 221
 222   /* In the special case where the loop never iterated, emit
 223      a different message so that we don't report an unroll by 0.
 224      This matches the equivalent message emitted during tree unrolling.  */
 225   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 226       && !loop->lpt_decision.times)
 227     {
 228       dump_printf_loc (report_flags, locus,
 229                        "loop turned into non-loop; it never loops.\n");
 230       return;
 231     }
 232
 233   desc = get_simple_loop_desc (loop);
 234
 235   if (desc->const_iter)
 236     niters = desc->niter;
 237   else if (loop->header->count)
 238     niters = expected_loop_iterations (loop);
 239
 240   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 241     dump_printf_loc (report_flags, locus,
 242                      "loop with %d iterations completely unrolled",
 243                      loop->lpt_decision.times + 1);
 244   else
 245     dump_printf_loc (report_flags, locus,
 246                      "loop %s %d times",
 247                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 248                        ? "peeled" : "unrolled"),
 249                      loop->lpt_decision.times);
 250   if (profile_info)
 251     dump_printf (report_flags,
 252                  " (header execution count %d",
 253                  (int)loop->header->count);
 254   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 255     dump_printf (report_flags,
 256                  "%s%s iterations %d)",
 257                  profile_info ? ", " : " (",
 258                  desc->const_iter ? "const" : "average",
 259                  niters);
 260   else if (profile_info)
 261     dump_printf (report_flags, ")");
 262
 263   dump_printf (report_flags, "\n");
 264 }
 265
 266 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 267 void
 268 unroll_and_peel_loops (int flags)
 269 {
 270   struct loop *loop;
 271   bool changed = false;
 272
 273   /* First perform complete loop peeling (it is almost surely a win,
 274      and affects parameters for further decision a lot).  */
 275   peel_loops_completely (flags);
 276
 277   /* Now decide rest of unrolling and peeling.  */
 278   decide_unrolling_and_peeling (flags);
 279
 280   /* Scan the loops, inner ones first.  */
 281   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 282     {
 283       /* And perform the appropriate transformations.  */
 284       switch (loop->lpt_decision.decision)
 285         {
 286         case LPT_PEEL_COMPLETELY:
 287           /* Already done.  */
 288           gcc_unreachable ();
 289         case LPT_PEEL_SIMPLE:
 290           peel_loop_simple (loop);
 291           changed = true;
 292           break;
 293         case LPT_UNROLL_CONSTANT:
 294           unroll_loop_constant_iterations (loop);
 295           changed = true;
 296           break;
 297         case LPT_UNROLL_RUNTIME:
 298           unroll_loop_runtime_iterations (loop);
 299           changed = true;
 300           break;
 301         case LPT_UNROLL_STUPID:
 302           unroll_loop_stupid (loop);
 303           changed = true;
 304           break;
 305         case LPT_NONE:
 306           break;
 307         default:
 308           gcc_unreachable ();
 309         }
 310     }
 311
 312     if (changed)
 313       {
 314         calculate_dominance_info (CDI_DOMINATORS);
 315         fix_loop_structure (NULL);
 316       }
 317
 318   iv_analysis_done ();
 319 }
 320
 321 /* Check whether exit of the LOOP is at the end of loop body.  */
 322
 323 static bool
 324 loop_exit_at_end_p (struct loop *loop)
 325 {
 326   struct niter_desc *desc = get_simple_loop_desc (loop);
 327   rtx insn;
 328
 329   if (desc->in_edge->dest != loop->latch)
 330     return false;
 331
 332   /* Check that the latch is empty.  */
 333   FOR_BB_INSNS (loop->latch, insn)
 334     {
 335       if (NONDEBUG_INSN_P (insn))
 336         return false;
 337     }
 338
 339   return true;
 340 }
 341
 342 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 343 static void
 344 peel_loops_completely (int flags)
 345 {
 346   struct loop *loop;
 347   bool changed = false;
 348
 349   /* Scan the loops, the inner ones first.  */
 350   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 351     {
 352       loop->lpt_decision.decision = LPT_NONE;
 353       location_t locus = get_loop_location (loop);
 354
 355       if (dump_enabled_p ())
 356         dump_printf_loc (TDF_RTL, locus,
 357                          ";; *** Considering loop %d at BB %d for "
 358                          "complete peeling ***\n",
 359                          loop->num, loop->header->index);
 360
 361       loop->ninsns = num_loop_insns (loop);
 362
 363       decide_peel_once_rolling (loop, flags);
 364       if (loop->lpt_decision.decision == LPT_NONE)
 365         decide_peel_completely (loop, flags);
 366
 367       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 368         {
 369           report_unroll_peel (loop, locus);
 370           peel_loop_completely (loop);
 371           changed = true;
 372         }
 373     }
 374
 375     if (changed)
 376       {
 377         calculate_dominance_info (CDI_DOMINATORS);
 378         fix_loop_structure (NULL);
 379       }
 380 }
 381
 382 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 383 static void
 384 decide_unrolling_and_peeling (int flags)
 385 {
 386   struct loop *loop;
 387
 388   /* Scan the loops, inner ones first.  */
 389   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 390     {
 391       loop->lpt_decision.decision = LPT_NONE;
 392       location_t locus = get_loop_location (loop);
 393
 394       if (dump_enabled_p ())
 395         dump_printf_loc (TDF_RTL, locus,
 396                          ";; *** Considering loop %d at BB %d for "
 397                          "unrolling and peeling ***\n",
 398                          loop->num, loop->header->index);
 399
 400       /* Do not peel cold areas.  */
 401       if (optimize_loop_for_size_p (loop))
 402         {
 403           if (dump_file)
 404             fprintf (dump_file, ";; Not considering loop, cold area\n");
 405           continue;
 406         }
 407
 408       /* Can the loop be manipulated?  */
 409       if (!can_duplicate_loop_p (loop))
 410         {
 411           if (dump_file)
 412             fprintf (dump_file,
 413                      ";; Not considering loop, cannot duplicate\n");
 414           continue;
 415         }
 416
 417       /* Skip non-innermost loops.  */
 418       if (loop->inner)
 419         {
 420           if (dump_file)
 421             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 422           continue;
 423         }
 424
 425       loop->ninsns = num_loop_insns (loop);
 426       loop->av_ninsns = average_num_loop_insns (loop);
 427
 428       /* Try transformations one by one in decreasing order of
 429          priority.  */
 430
 431       decide_unroll_constant_iterations (loop, flags);
 432       if (loop->lpt_decision.decision == LPT_NONE)
 433         decide_unroll_runtime_iterations (loop, flags);
 434       if (loop->lpt_decision.decision == LPT_NONE)
 435         decide_unroll_stupid (loop, flags);
 436       if (loop->lpt_decision.decision == LPT_NONE)
 437         decide_peel_simple (loop, flags);
 438
 439       report_unroll_peel (loop, locus);
 440     }
 441 }
 442
 443 /* Decide whether the LOOP is once rolling and suitable for complete
 444    peeling.  */
 445 static void
 446 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 447 {
 448   struct niter_desc *desc;
 449
 450   if (dump_file)
 451     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 452
 453   /* Is the loop small enough?  */
 454   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 455     {
 456       if (dump_file)
 457         fprintf (dump_file, ";; Not considering loop, is too big\n");
 458       return;
 459     }
 460
 461   /* Check for simple loops.  */
 462   desc = get_simple_loop_desc (loop);
 463
 464   /* Check number of iterations.  */
 465   if (!desc->simple_p
 466       || desc->assumptions
 467       || desc->infinite
 468       || !desc->const_iter
 469       || (desc->niter != 0
 470           && get_max_loop_iterations_int (loop) != 0))
 471     {
 472       if (dump_file)
 473         fprintf (dump_file,
 474                  ";; Unable to prove that the loop rolls exactly once\n");
 475       return;
 476     }
 477
 478   /* Success.  */
 479   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 480 }
 481
 482 /* Decide whether the LOOP is suitable for complete peeling.  */
 483 static void
 484 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 485 {
 486   unsigned npeel;
 487   struct niter_desc *desc;
 488
 489   if (dump_file)
 490     fprintf (dump_file, "\n;; Considering peeling completely\n");
 491
 492   /* Skip non-innermost loops.  */
 493   if (loop->inner)
 494     {
 495       if (dump_file)
 496         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 497       return;
 498     }
 499
 500   /* Do not peel cold areas.  */
 501   if (optimize_loop_for_size_p (loop))
 502     {
 503       if (dump_file)
 504         fprintf (dump_file, ";; Not considering loop, cold area\n");
 505       return;
 506     }
 507
 508   /* Can the loop be manipulated?  */
 509   if (!can_duplicate_loop_p (loop))
 510     {
 511       if (dump_file)
 512         fprintf (dump_file,
 513                  ";; Not considering loop, cannot duplicate\n");
 514       return;
 515     }
 516
 517   /* npeel = number of iterations to peel.  */
 518   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 519   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 520     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 521
 522   /* Is the loop small enough?  */
 523   if (!npeel)
 524     {
 525       if (dump_file)
 526         fprintf (dump_file, ";; Not considering loop, is too big\n");
 527       return;
 528     }
 529
 530   /* Check for simple loops.  */
 531   desc = get_simple_loop_desc (loop);
 532
 533   /* Check number of iterations.  */
 534   if (!desc->simple_p
 535       || desc->assumptions
 536       || !desc->const_iter
 537       || desc->infinite)
 538     {
 539       if (dump_file)
 540         fprintf (dump_file,
 541                  ";; Unable to prove that the loop iterates constant times\n");
 542       return;
 543     }
 544
 545   if (desc->niter > npeel - 1)
 546     {
 547       if (dump_file)
 548         {
 549           fprintf (dump_file,
 550                    ";; Not peeling loop completely, rolls too much (");
 551           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 552           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 553         }
 554       return;
 555     }
 556
 557   /* Success.  */
 558   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 559 }
 560
 561 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 562    completely.  The transformation done:
 563
 564    for (i = 0; i < 4; i++)
 565      body;
 566
 567    ==>
 568
 569    i = 0;
 570    body; i++;
 571    body; i++;
 572    body; i++;
 573    body; i++;
 574    */
 575 static void
 576 peel_loop_completely (struct loop *loop)
 577 {
 578   sbitmap wont_exit;
 579   unsigned HOST_WIDE_INT npeel;
 580   unsigned i;
 581   edge ein;
 582   struct niter_desc *desc = get_simple_loop_desc (loop);
 583   struct opt_info *opt_info = NULL;
 584
 585   npeel = desc->niter;
 586
 587   if (npeel)
 588     {
 589       bool ok;
 590
 591       wont_exit = sbitmap_alloc (npeel + 1);
 592       bitmap_ones (wont_exit);
 593       bitmap_clear_bit (wont_exit, 0);
 594       if (desc->noloop_assumptions)
 595         bitmap_clear_bit (wont_exit, 1);
 596
 597       auto_vec<edge> remove_edges;
 598       if (flag_split_ivs_in_unroller)
 599         opt_info = analyze_insns_in_loop (loop);
 600
 601       opt_info_start_duplication (opt_info);
 602       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 603                                           npeel,
 604                                           wont_exit, desc->out_edge,
 605                                           &remove_edges,
 606                                           DLTHE_FLAG_UPDATE_FREQ
 607                                           | DLTHE_FLAG_COMPLETTE_PEEL
 608                                           | (opt_info
 609                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 610       gcc_assert (ok);
 611
 612       free (wont_exit);
 613
 614       if (opt_info)
 615         {
 616           apply_opt_in_copies (opt_info, npeel, false, true);
 617           free_opt_info (opt_info);
 618         }
 619
 620       /* Remove the exit edges.  */
 621       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 622         remove_path (ein);
 623     }
 624
 625   ein = desc->in_edge;
 626   free_simple_loop_desc (loop);
 627
 628   /* Now remove the unreachable part of the last iteration and cancel
 629      the loop.  */
 630   remove_path (ein);
 631
 632   if (dump_file)
 633     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 634 }
 635
 636 /* Decide whether to unroll LOOP iterating constant number of times
 637    and how much.  */
 638
 639 static void
 640 decide_unroll_constant_iterations (struct loop *loop, int flags)
 641 {
 642   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 643   struct niter_desc *desc;
 644   widest_int iterations;
 645
 646   if (!(flags & UAP_UNROLL))
 647     {
 648       /* We were not asked to, just return back silently.  */
 649       return;
 650     }
 651
 652   if (dump_file)
 653     fprintf (dump_file,
 654              "\n;; Considering unrolling loop with constant "
 655              "number of iterations\n");
 656
 657   /* nunroll = total number of copies of the original loop body in
 658      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 659   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 660   nunroll_by_av
 661     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 662   if (nunroll > nunroll_by_av)
 663     nunroll = nunroll_by_av;
 664   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 665     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 666
 667   if (targetm.loop_unroll_adjust)
 668     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 669
 670   /* Skip big loops.  */
 671   if (nunroll <= 1)
 672     {
 673       if (dump_file)
 674         fprintf (dump_file, ";; Not considering loop, is too big\n");
 675       return;
 676     }
 677
 678   /* Check for simple loops.  */
 679   desc = get_simple_loop_desc (loop);
 680
 681   /* Check number of iterations.  */
 682   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 683     {
 684       if (dump_file)
 685         fprintf (dump_file,
 686                  ";; Unable to prove that the loop iterates constant times\n");
 687       return;
 688     }
 689
 690   /* Check whether the loop rolls enough to consider.
 691      Consult also loop bounds and profile; in the case the loop has more
 692      than one exit it may well loop less than determined maximal number
 693      of iterations.  */
 694   if (desc->niter < 2 * nunroll
 695       || ((get_estimated_loop_iterations (loop, &iterations)
 696            || get_max_loop_iterations (loop, &iterations))
 697           && wi::ltu_p (iterations, 2 * nunroll)))
 698     {
 699       if (dump_file)
 700         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 701       return;
 702     }
 703
 704   /* Success; now compute number of iterations to unroll.  We alter
 705      nunroll so that as few as possible copies of loop body are
 706      necessary, while still not decreasing the number of unrollings
 707      too much (at most by 1).  */
 708   best_copies = 2 * nunroll + 10;
 709
 710   i = 2 * nunroll + 2;
 711   if (i - 1 >= desc->niter)
 712     i = desc->niter - 2;
 713
 714   for (; i >= nunroll - 1; i--)
 715     {
 716       unsigned exit_mod = desc->niter % (i + 1);
 717
 718       if (!loop_exit_at_end_p (loop))
 719         n_copies = exit_mod + i + 1;
 720       else if (exit_mod != (unsigned) i
 721                || desc->noloop_assumptions != NULL_RTX)
 722         n_copies = exit_mod + i + 2;
 723       else
 724         n_copies = i + 1;
 725
 726       if (n_copies < best_copies)
 727         {
 728           best_copies = n_copies;
 729           best_unroll = i;
 730         }
 731     }
 732
 733   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 734   loop->lpt_decision.times = best_unroll;
 735 }
 736
 737 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 738    The transformation does this:
 739
 740    for (i = 0; i < 102; i++)
 741      body;
 742
 743    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 744
 745    i = 0;
 746    body; i++;
 747    body; i++;
 748    while (i < 102)
 749      {
 750        body; i++;
 751        body; i++;
 752        body; i++;
 753        body; i++;
 754      }
 755   */
 756 static void
 757 unroll_loop_constant_iterations (struct loop *loop)
 758 {
 759   unsigned HOST_WIDE_INT niter;
 760   unsigned exit_mod;
 761   sbitmap wont_exit;
 762   unsigned i;
 763   edge e;
 764   unsigned max_unroll = loop->lpt_decision.times;
 765   struct niter_desc *desc = get_simple_loop_desc (loop);
 766   bool exit_at_end = loop_exit_at_end_p (loop);
 767   struct opt_info *opt_info = NULL;
 768   bool ok;
 769
 770   niter = desc->niter;
 771
 772   /* Should not get here (such loop should be peeled instead).  */
 773   gcc_assert (niter > max_unroll + 1);
 774
 775   exit_mod = niter % (max_unroll + 1);
 776
 777   wont_exit = sbitmap_alloc (max_unroll + 1);
 778   bitmap_ones (wont_exit);
 779
 780   auto_vec<edge> remove_edges;
 781   if (flag_split_ivs_in_unroller
 782       || flag_variable_expansion_in_unroller)
 783     opt_info = analyze_insns_in_loop (loop);
 784
 785   if (!exit_at_end)
 786     {
 787       /* The exit is not at the end of the loop; leave exit test
 788          in the first copy, so that the loops that start with test
 789          of exit condition have continuous body after unrolling.  */
 790
 791       if (dump_file)
 792         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 793
 794       /* Peel exit_mod iterations.  */
 795       bitmap_clear_bit (wont_exit, 0);
 796       if (desc->noloop_assumptions)
 797         bitmap_clear_bit (wont_exit, 1);
 798
 799       if (exit_mod)
 800         {
 801           opt_info_start_duplication (opt_info);
 802           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 803                                               exit_mod,
 804                                               wont_exit, desc->out_edge,
 805                                               &remove_edges,
 806                                               DLTHE_FLAG_UPDATE_FREQ
 807                                               | (opt_info && exit_mod > 1
 808                                                  ? DLTHE_RECORD_COPY_NUMBER
 809                                                    : 0));
 810           gcc_assert (ok);
 811
 812           if (opt_info && exit_mod > 1)
 813             apply_opt_in_copies (opt_info, exit_mod, false, false);
 814
 815           desc->noloop_assumptions = NULL_RTX;
 816           desc->niter -= exit_mod;
 817           loop->nb_iterations_upper_bound -= exit_mod;
 818           if (loop->any_estimate
 819               && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
 820             loop->nb_iterations_estimate -= exit_mod;
 821           else
 822             loop->any_estimate = false;
 823         }
 824
 825       bitmap_set_bit (wont_exit, 1);
 826     }
 827   else
 828     {
 829       /* Leave exit test in last copy, for the same reason as above if
 830          the loop tests the condition at the end of loop body.  */
 831
 832       if (dump_file)
 833         fprintf (dump_file, ";; Condition at end of loop.\n");
 834
 835       /* We know that niter >= max_unroll + 2; so we do not need to care of
 836          case when we would exit before reaching the loop.  So just peel
 837          exit_mod + 1 iterations.  */
 838       if (exit_mod != max_unroll
 839           || desc->noloop_assumptions)
 840         {
 841           bitmap_clear_bit (wont_exit, 0);
 842           if (desc->noloop_assumptions)
 843             bitmap_clear_bit (wont_exit, 1);
 844
 845           opt_info_start_duplication (opt_info);
 846           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 847                                               exit_mod + 1,
 848                                               wont_exit, desc->out_edge,
 849                                               &remove_edges,
 850                                               DLTHE_FLAG_UPDATE_FREQ
 851                                               | (opt_info && exit_mod > 0
 852                                                  ? DLTHE_RECORD_COPY_NUMBER
 853                                                    : 0));
 854           gcc_assert (ok);
 855
 856           if (opt_info && exit_mod > 0)
 857             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 858
 859           desc->niter -= exit_mod + 1;
 860           loop->nb_iterations_upper_bound -= exit_mod + 1;
 861           if (loop->any_estimate
 862               && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
 863             loop->nb_iterations_estimate -= exit_mod + 1;
 864           else
 865             loop->any_estimate = false;
 866           desc->noloop_assumptions = NULL_RTX;
 867
 868           bitmap_set_bit (wont_exit, 0);
 869           bitmap_set_bit (wont_exit, 1);
 870         }
 871
 872       bitmap_clear_bit (wont_exit, max_unroll);
 873     }
 874
 875   /* Now unroll the loop.  */
 876
 877   opt_info_start_duplication (opt_info);
 878   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 879                                       max_unroll,
 880                                       wont_exit, desc->out_edge,
 881                                       &remove_edges,
 882                                       DLTHE_FLAG_UPDATE_FREQ
 883                                       | (opt_info
 884                                          ? DLTHE_RECORD_COPY_NUMBER
 885                                            : 0));
 886   gcc_assert (ok);
 887
 888   if (opt_info)
 889     {
 890       apply_opt_in_copies (opt_info, max_unroll, true, true);
 891       free_opt_info (opt_info);
 892     }
 893
 894   free (wont_exit);
 895
 896   if (exit_at_end)
 897     {
 898       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 899       /* Find a new in and out edge; they are in the last copy we have made.  */
 900
 901       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 902         {
 903           desc->out_edge = EDGE_SUCC (exit_block, 0);
 904           desc->in_edge = EDGE_SUCC (exit_block, 1);
 905         }
 906       else
 907         {
 908           desc->out_edge = EDGE_SUCC (exit_block, 1);
 909           desc->in_edge = EDGE_SUCC (exit_block, 0);
 910         }
 911     }
 912
 913   desc->niter /= max_unroll + 1;
 914   loop->nb_iterations_upper_bound
 915     = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
 916   if (loop->any_estimate)
 917     loop->nb_iterations_estimate
 918       = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
 919   desc->niter_expr = GEN_INT (desc->niter);
 920
 921   /* Remove the edges.  */
 922   FOR_EACH_VEC_ELT (remove_edges, i, e)
 923     remove_path (e);
 924
 925   if (dump_file)
 926     fprintf (dump_file,
 927              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 928              max_unroll, num_loop_insns (loop));
 929 }
 930
 931 /* Decide whether to unroll LOOP iterating runtime computable number of times
 932    and how much.

        FLAGS is tested against UAP_UNROLL; when that bit is clear the function
        returns without touching LOOP.  On success LOOP->lpt_decision.decision
        is set to LPT_UNROLL_RUNTIME and LOOP->lpt_decision.times to a power of
        two minus one.  On every rejection path the decision is left unchanged
        and, if dump_file is set, the reason is printed there.  */
 933 static void
 934 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 935 {
 936   unsigned nunroll, nunroll_by_av, i;
 937   struct niter_desc *desc;
 938   widest_int iterations;
 939
 940   if (!(flags & UAP_UNROLL))
 941     {
 942       /* We were not asked to, just return back silently.  */
 943       return;
 944     }
 945
 946   if (dump_file)
 947     fprintf (dump_file,
 948              "\n;; Considering unrolling loop with runtime "
 949              "computable number of iterations\n");
 950
 951   /* nunroll = total number of copies of the original loop body in
 952      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).
          It is the smallest of three limits: the total insn budget, the
          average insn budget and the maximal number of unrollings.  */
 953   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 954   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 955   if (nunroll > nunroll_by_av)
 956     nunroll = nunroll_by_av;
 957   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 958     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 959
       /* The target hook is optional; when present its result replaces the
          factor computed above.  */
 960   if (targetm.loop_unroll_adjust)
 961     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 962
 963   /* Skip big loops.  */
 964   if (nunroll <= 1)
 965     {
 966       if (dump_file)
 967         fprintf (dump_file, ";; Not considering loop, is too big\n");
 968       return;
 969     }
 970
 971   /* Check for simple loops.  */
 972   desc = get_simple_loop_desc (loop);
 973
 974   /* Check simpleness.  */
 975   if (!desc->simple_p || desc->assumptions)
 976     {
 977       if (dump_file)
 978         fprintf (dump_file,
 979                  ";; Unable to prove that the number of iterations "
 980                  "can be counted in runtime\n");
 981       return;
 982     }
 983
       /* Loops with a constant iteration count are rejected here.  */
 984   if (desc->const_iter)
 985     {
 986       if (dump_file)
 987         fprintf (dump_file, ";; Loop iterates constant times\n");
 988       return;
 989     }
 990
 991   /* Check whether the loop rolls.  The estimate is consulted first; the
          upper bound is only queried when the first call returns false.  The
          loop is rejected when the value found is below 2 * nunroll.  */
 992   if ((get_estimated_loop_iterations (loop, &iterations)
 993        || get_max_loop_iterations (loop, &iterations))
 994       && wi::ltu_p (iterations, 2 * nunroll))
 995     {
 996       if (dump_file)
 997         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 998       return;
 999     }
1000
1001   /* Success; now force nunroll to be power of 2, as we are unable to
1002      cope with overflows in computation of number of iterations.  After
          the loop I is the largest power of two not exceeding nunroll.  */
1003   for (i = 1; 2 * i <= nunroll; i *= 2)
1004     continue;
1005
       /* TIMES counts the additional copies, hence I - 1.  */
1006   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1007   loop->lpt_decision.times = i - 1;
1008 }
1009
1010 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1011    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1012    and NULL is returned instead.  INSNS are emitted after BB_END of the
        block produced by split_edge.  */
1013
1014 basic_block
1015 split_edge_and_insert (edge e, rtx insns)
1016 {
1017   basic_block bb;
1018
       /* Nothing to insert: leave E alone so that the CFG is not changed.  */
1019   if (!insns)
1020     return NULL;
1021   bb = split_edge (e);
1022   emit_insn_after (insns, BB_END (bb));
1023
1024   /* ??? We used to assume that INSNS can contain control flow insns, and
1025      that we had to try to find sub basic blocks in BB to maintain a valid
1026      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1027      and call break_superblocks when going out of cfglayout mode.  But it
1028      turns out that this never happens; and that if it does ever happen,
1029      the verify_flow_info at the end of the RTL loop passes would fail.
1030
1031      There are two reasons why we expected we could have control flow insns
1032      in INSNS.  The first is when a comparison has to be done in parts, and
1033      the second is when the number of iterations is computed for loops with
1034      the number of iterations known at runtime.  In both cases, test cases
1035      to get control flow in INSNS appear to be impossible to construct:
1036
1037       * If do_compare_rtx_and_jump needs several branches to do comparison
1038         in a mode that needs comparison by parts, we cannot analyze the
1039         number of iterations of the loop, and we never get to unrolling it.
1040
1041       * The code in expand_divmod that was suspected to cause creation of
1042         branching code seems to be only accessed for signed division.  The
1043         divisions used by # of iterations analysis are always unsigned.
1044         Problems might arise on architectures that emit branching code
1045         for some operations that may appear in the unroller (especially
1046         for division), but we have no such architectures.
1047
1048      Considering all this, it was decided that we should for now assume
1049      that INSNS can in theory contain control flow insns, but in practice
1050      it never does.  So we don't handle the theoretical case, and should
1051      a real failure ever show up, we have a pretty good clue for how to
1052      fix it.  */
1053
1054   return bb;
1055 }
1056
1057 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
1058    true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
1059    in order to create a jump.

        The comparison mode is taken from OP0, or from OP1 when OP0 has VOIDmode.
        For MODE_CC comparisons CINSN is required (asserted) and its pattern is
        copied; for every other mode CINSN must be NULL (asserted) and the jump
        is generated by do_compare_rtx_and_jump.  The insns are collected
        between start_sequence and end_sequence and returned via get_insns.  */
1060
1061 static rtx
1062 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
1063                       rtx cinsn)
1064 {
1065   rtx seq, jump, cond;
1066   enum machine_mode mode;
1067
1068   mode = GET_MODE (op0);
1069   if (mode == VOIDmode)
1070     mode = GET_MODE (op1);
1071
1072   start_sequence ();
1073   if (GET_MODE_CLASS (mode) == MODE_CC)
1074     {
1075       /* A hack -- there seems to be no easy generic way how to make a
1076          conditional jump from a ccmode comparison.  */
1077       gcc_assert (cinsn);
           /* Check that the condition of CINSN is exactly COMP (OP0, OP1).  */
1078       cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
1079       gcc_assert (GET_CODE (cond) == comp);
1080       gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
1081       gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
           /* Emit a copy of the jump; it first inherits the label of CINSN
              (with the use count bumped) and is then redirected to LABEL.  */
1082       emit_jump_insn (copy_insn (PATTERN (cinsn)));
1083       jump = get_last_insn ();
1084       gcc_assert (JUMP_P (jump));
1085       JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
1086       LABEL_NUSES (JUMP_LABEL (jump))++;
1087       redirect_jump (jump, label, 0);
1088     }
1089   else
1090     {
1091       gcc_assert (!cinsn);
1092
1093       op0 = force_operand (op0, NULL_RTX);
1094       op1 = force_operand (op1, NULL_RTX);
1095       do_compare_rtx_and_jump (op0, op1, comp, 0,
1096                                mode, NULL_RTX, NULL_RTX, label, -1);
           /* The last insn emitted is asserted to be the jump; record its
              target and account for the new use of LABEL.  */
1097       jump = get_last_insn ();
1098       gcc_assert (JUMP_P (jump));
1099       JUMP_LABEL (jump) = label;
1100       LABEL_NUSES (label)++;
1101     }
       /* Attach PROB to the jump as a REG_BR_PROB note on both paths.  */
1102   add_int_reg_note (jump, REG_BR_PROB, prob);
1103
1104   seq = get_insns ();
1105   end_sequence ();
1106
1107   return seq;
1108 }
1109
1110 /* Unroll LOOP for which we are able to count number of iterations in runtime
1111    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1112    extra care for case n < 0):
1113
1114    for (i = 0; i < n; i++)
1115      body;
1116
1117    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1118
1119    i = 0;
1120    mod = n % 4;
1121
1122    switch (mod)
1123      {
1124        case 3:
1125          body; i++;
1126        case 2:
1127          body; i++;
1128        case 1:
1129          body; i++;
1130        case 0: ;
1131      }
1132
1133    while (i < n)
1134      {
1135        body; i++;
1136        body; i++;
1137        body; i++;
1138        body; i++;
1139      }

        The modulo is computed as `niter & TIMES', so TIMES + 1 has to be a
        power of two.  The function also rewrites the niter_desc of LOOP
        (niter_expr, in_edge/out_edge, noloop_assumptions) and the recorded
        iteration bounds to describe the unrolled loop.
1140    */
1141 static void
1142 unroll_loop_runtime_iterations (struct loop *loop)
1143 {
1144   rtx old_niter, niter, init_code, branch_code, tmp;
1145   unsigned i, j, p;
1146   basic_block preheader, *body, swtch, ezc_swtch;
1147   sbitmap wont_exit;
1148   int may_exit_copy;
1149   unsigned n_peel;
1150   edge e;
1151   bool extra_zero_check, last_may_exit;
1152   unsigned max_unroll = loop->lpt_decision.times;
1153   struct niter_desc *desc = get_simple_loop_desc (loop);
1154   bool exit_at_end = loop_exit_at_end_p (loop);
1155   struct opt_info *opt_info = NULL;
1156   bool ok;
1157
       /* The insns of the loop are analyzed only when one of the two unroller
          flags is set; otherwise OPT_INFO stays NULL.  */
1158   if (flag_split_ivs_in_unroller
1159       || flag_variable_expansion_in_unroller)
1160     opt_info = analyze_insns_in_loop (loop);
1161
1162   /* Remember blocks whose dominators will have to be updated.  */
1163   auto_vec<basic_block> dom_bbs;
1164
       /* These are the blocks outside LOOP that get_dominated_by reports for
          some block of LOOP.  */
1165   body = get_loop_body (loop);
1166   for (i = 0; i < loop->num_nodes; i++)
1167     {
1168       vec<basic_block> ldom;
1169       basic_block bb;
1170
1171       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1172       FOR_EACH_VEC_ELT (ldom, j, bb)
1173         if (!flow_bb_inside_loop_p (loop, bb))
1174           dom_bbs.safe_push (bb);
1175
1176       ldom.release ();
1177     }
1178   free (body);
1179
1180   if (!exit_at_end)
1181     {
1182       /* Leave exit in first copy (for explanation why see comment in
1183          unroll_loop_constant_iterations).  */
1184       may_exit_copy = 0;
1185       n_peel = max_unroll - 1;
1186       extra_zero_check = true;
1187       last_may_exit = false;
1188     }
1189   else
1190     {
1191       /* Leave exit in last copy (for explanation why see comment in
1192          unroll_loop_constant_iterations).  */
1193       may_exit_copy = max_unroll;
1194       n_peel = max_unroll;
1195       extra_zero_check = false;
1196       last_may_exit = true;
1197     }
1198
1199   /* Get expression for number of iterations.  OLD_NITER keeps the register
          holding the full count; NITER is overwritten with the modulo below.  */
1200   start_sequence ();
1201   old_niter = niter = gen_reg_rtx (desc->mode);
1202   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1203   if (tmp != niter)
1204     emit_move_insn (niter, tmp);
1205
1206   /* Count modulo by ANDing it with max_unroll; we use the fact that
1207      the number of unrollings is a power of two, and thus this is correct
1208      even if there is overflow in the computation.  */
1209   niter = expand_simple_binop (desc->mode, AND,
1210                                niter, gen_int_mode (max_unroll, desc->mode),
1211                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1212
1213   init_code = get_insns ();
1214   end_sequence ();
1215   unshare_all_rtl_in_chain (init_code);
1216
1217   /* Precondition the loop.  */
1218   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1219
1220   auto_vec<edge> remove_edges;
1221
       /* WONT_EXIT is reused for every duplication below and released with
          free once the loop has been unrolled.  */
1222   wont_exit = sbitmap_alloc (max_unroll + 2);
1223
1224   /* Peel the first copy of loop body (almost always we must leave exit test
1225      here; the only exception is when we have extra zero check and the number
1226      of iterations is reliable).  Also record the place of (possible) extra
1227      zero check.  */
1228   bitmap_clear (wont_exit);
1229   if (extra_zero_check
1230       && !desc->noloop_assumptions)
1231     bitmap_set_bit (wont_exit, 1);
1232   ezc_swtch = loop_preheader_edge (loop)->src;
1233   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1234                                       1, wont_exit, desc->out_edge,
1235                                       &remove_edges,
1236                                       DLTHE_FLAG_UPDATE_FREQ);
1237   gcc_assert (ok);
1238
1239   /* Record the place where switch will be built for preconditioning.  */
1240   swtch = split_edge (loop_preheader_edge (loop));
1241
1242   for (i = 0; i < n_peel; i++)
1243     {
1244       /* Peel the copy.  Bit 1 is left clear only for the last peeled copy
              when LAST_MAY_EXIT is set.  */
1245       bitmap_clear (wont_exit);
1246       if (i != n_peel - 1 || !last_may_exit)
1247         bitmap_set_bit (wont_exit, 1);
1248       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1249                                           1, wont_exit, desc->out_edge,
1250                                           &remove_edges,
1251                                           DLTHE_FLAG_UPDATE_FREQ);
1252       gcc_assert (ok);
1253
1254       /* Create item for switch.  J is the constant NITER is compared with
              for this copy, P the probability given to the resulting branch.  */
1255       j = n_peel - i - (extra_zero_check ? 0 : 1);
1256       p = REG_BR_PROB_BASE / (i + 2);
1257
1258       preheader = split_edge (loop_preheader_edge (loop));
1259       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1260                                           block_label (preheader), p,
1261                                           NULL_RTX);
1262
1263       /* We rely on the fact that the compare and jump cannot be optimized out,
1264          and hence the cfg we create is correct.  */
1265       gcc_assert (branch_code != NULL_RTX);
1266
           /* Put the test into a new block in front of SWTCH and add the edge
              taken when the comparison succeeds.  */
1267       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1268       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1269       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1270       e = make_edge (swtch, preheader,
1271                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1272       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1273       e->probability = p;
1274     }
1275
1276   if (extra_zero_check)
1277     {
1278       /* Add branch for zero iterations.  It is built after EZC_SWTCH, the
              block recorded before the first copy was peeled.  */
1279       p = REG_BR_PROB_BASE / (max_unroll + 1);
1280       swtch = ezc_swtch;
1281       preheader = split_edge (loop_preheader_edge (loop));
1282       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1283                                           block_label (preheader), p,
1284                                           NULL_RTX);
1285       gcc_assert (branch_code != NULL_RTX);
1286
1287       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1288       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1289       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1290       e = make_edge (swtch, preheader,
1291                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1292       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1293       e->probability = p;
1294     }
1295
1296   /* Recount dominators for outer blocks.  */
1297   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1298
1299   /* And unroll loop.  All bits of WONT_EXIT are set except the one for
          MAY_EXIT_COPY.  */
1300
1301   bitmap_ones (wont_exit);
1302   bitmap_clear_bit (wont_exit, may_exit_copy);
1303   opt_info_start_duplication (opt_info);
1304
1305   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1306                                       max_unroll,
1307                                       wont_exit, desc->out_edge,
1308                                       &remove_edges,
1309                                       DLTHE_FLAG_UPDATE_FREQ
1310                                       | (opt_info
1311                                          ? DLTHE_RECORD_COPY_NUMBER
1312                                            : 0));
1313   gcc_assert (ok);
1314
1315   if (opt_info)
1316     {
1317       apply_opt_in_copies (opt_info, max_unroll, true, true);
1318       free_opt_info (opt_info);
1319     }
1320
1321   free (wont_exit);
1322
1323   if (exit_at_end)
1324     {
1325       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1326       /* Find a new in and out edge; they are in the last copy we have
1327          made.  The successor leading to the old exit destination becomes
              the out edge, the other one the in edge.  */
1328
1329       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1330         {
1331           desc->out_edge = EDGE_SUCC (exit_block, 0);
1332           desc->in_edge = EDGE_SUCC (exit_block, 1);
1333         }
1334       else
1335         {
1336           desc->out_edge = EDGE_SUCC (exit_block, 1);
1337           desc->in_edge = EDGE_SUCC (exit_block, 0);
1338         }
1339     }
1340
1341   /* Remove the edges.  */
1342   FOR_EACH_VEC_ELT (remove_edges, i, e)
1343     remove_path (e);
1344
1345   /* We must be careful when updating the number of iterations due to
1346      preconditioning and the fact that the value must be valid at entry
1347      of the loop.  After passing through the above code, we see that
1348      the correct new number of iterations is this:  */
1349   gcc_assert (!desc->const_iter);
1350   desc->niter_expr =
1351     simplify_gen_binary (UDIV, desc->mode, old_niter,
1352                          gen_int_mode (max_unroll + 1, desc->mode));
1353   loop->nb_iterations_upper_bound
1354     = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
1355   if (loop->any_estimate)
1356     loop->nb_iterations_estimate
1357       = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
       /* With the exit at the end one more iteration is subtracted; an
          estimate that is already zero is dropped instead of decremented.  */
1358   if (exit_at_end)
1359     {
1360       desc->niter_expr =
1361         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1362       desc->noloop_assumptions = NULL_RTX;
1363       --loop->nb_iterations_upper_bound;
1364       if (loop->any_estimate
1365           && loop->nb_iterations_estimate != 0)
1366         --loop->nb_iterations_estimate;
1367       else
1368         loop->any_estimate = false;
1369     }
1370
1371   if (dump_file)
1372     fprintf (dump_file,
1373              ";; Unrolled loop %d times, counting # of iterations "
1374              "in runtime, %i insns\n",
1375              max_unroll, num_loop_insns (loop));
1376 }
1377
1378 /* Decide whether to simply peel LOOP and how much.

        FLAGS is tested against UAP_PEEL; when that bit is clear the function
        returns without touching LOOP.  On success LOOP->lpt_decision.decision
        is set to LPT_PEEL_SIMPLE and LOOP->lpt_decision.times to the number of
        iterations to peel.  On every rejection path the decision is left
        unchanged and, if dump_file is set, the reason is printed there.  */
1379 static void
1380 decide_peel_simple (struct loop *loop, int flags)
1381 {
1382   unsigned npeel;
1383   widest_int iterations;
1384
1385   if (!(flags & UAP_PEEL))
1386     {
1387       /* We were not asked to, just return back silently.  */
1388       return;
1389     }
1390
1391   if (dump_file)
1392     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1393
1394   /* npeel = number of iterations to peel.  It starts as the smaller of the
          insn budget divided by the loop size and the maximal peel count.  */
1395   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1396   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1397     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1398
1399   /* Skip big loops.  */
1400   if (!npeel)
1401     {
1402       if (dump_file)
1403         fprintf (dump_file, ";; Not considering loop, is too big\n");
1404       return;
1405     }
1406
1407   /* Do not simply peel loops with branches inside -- it increases number
1408      of mispredicts.
1409      Exception is when we do have profile and we however have good chance
1410      to peel proper number of iterations loop will iterate in practice.
1411      TODO: this heuristic needs tuning; while for complete unrolling
1412      the branch inside loop mostly eliminates any improvements, for
1413      peeling it is not the case.  Also a function call inside loop is
1414      also branch from branch prediction POV (and probably better reason
1415      to not unroll/peel).  */
1416   if (num_loop_branches (loop) > 1
1417       && profile_status_for_fn (cfun) != PROFILE_READ)
1418     {
1419       if (dump_file)
1420         fprintf (dump_file, ";; Not peeling, contains branches\n");
1421       return;
1422     }
1423
1424   /* If we have realistic estimate on number of iterations, use it.  The
          loop is rejected when the estimate is not below the limit; otherwise
          the limit is replaced by the estimate plus one.  */
1425   if (get_estimated_loop_iterations (loop, &iterations))
1426     {
1427       if (wi::leu_p (npeel, iterations))
1428         {
1429           if (dump_file)
1430             {
1431               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1432               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1433                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1434               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1435                        npeel);
1436             }
1437           return;
1438         }
1439       npeel = iterations.to_shwi () + 1;
1440     }
1441   /* If we have small enough bound on iterations, we can still peel (completely
1442      unroll).  */
1443   else if (get_max_loop_iterations (loop, &iterations)
1444            && wi::ltu_p (iterations, npeel))
1445     npeel = iterations.to_shwi () + 1;
1446   else
1447     {
1448       /* For now we have no good heuristics to decide whether loop peeling
1449          will be effective, so disable it.  */
1450       if (dump_file)
1451         fprintf (dump_file,
1452                  ";; Not peeling loop, no evidence it will be profitable\n");
1453       return;
1454     }
1455
1456   /* Success.  */
1457   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1458   loop->lpt_decision.times = npeel;
1459 }
1460
1461 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1462
1463    while (cond)
1464      body;
1465
1466    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1467
1468    if (!cond) goto end;
1469    body;
1470    if (!cond) goto end;
1471    body;
1472    if (!cond) goto end;
1473    body;
1474    while (cond)
1475      body;
1476    end: ;

        Afterwards the niter_desc of LOOP is either adjusted (constant number
        of iterations) or released (see the end of the function).
1477    */
1478 static void
1479 peel_loop_simple (struct loop *loop)
1480 {
1481   sbitmap wont_exit;
1482   unsigned npeel = loop->lpt_decision.times;
1483   struct niter_desc *desc = get_simple_loop_desc (loop);
1484   struct opt_info *opt_info = NULL;
1485   bool ok;
1486
       /* The insns are analyzed only when IV splitting is enabled and more
          than one copy is peeled; otherwise OPT_INFO stays NULL.  */
1487   if (flag_split_ivs_in_unroller && npeel > 1)
1488     opt_info = analyze_insns_in_loop (loop);
1489
       /* No bit of WONT_EXIT is set, matching the picture above where every
          peeled copy keeps its exit test.  */
1490   wont_exit = sbitmap_alloc (npeel + 1);
1491   bitmap_clear (wont_exit);
1492
1493   opt_info_start_duplication (opt_info);
1494
       /* The copies are placed on the preheader edge, i.e. in front of LOOP.  */
1495   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1496                                       npeel, wont_exit, NULL,
1497                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1498                                       | (opt_info
1499                                          ? DLTHE_RECORD_COPY_NUMBER
1500                                            : 0));
1501   gcc_assert (ok);
1502
1503   free (wont_exit);
1504
1505   if (opt_info)
1506     {
1507       apply_opt_in_copies (opt_info, npeel, false, false);
1508       free_opt_info (opt_info);
1509     }
1510
1511   if (desc->simple_p)
1512     {
1513       if (desc->const_iter)
1514         {
               /* NPEEL iterations now run in front of the loop, so the
                  constant count shrinks by that much.  */
1515           desc->niter -= npeel;
1516           desc->niter_expr = GEN_INT (desc->niter);
1517           desc->noloop_assumptions = NULL_RTX;
1518         }
1519       else
1520         {
1521           /* We cannot just update niter_expr, as its value might be clobbered
1522              inside loop.  We could handle this by counting the number into
1523              temporary just like we do in runtime unrolling, but it does not
1524              seem worthwhile.  */
1525           free_simple_loop_desc (loop);
1526         }
1527     }
1528   if (dump_file)
1529     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1530 }
1531
1532 /* Decide whether to unroll LOOP stupidly and how much.

        FLAGS is tested against UAP_UNROLL_ALL; when that bit is clear the
        function returns without touching LOOP.  On success
        LOOP->lpt_decision.decision is set to LPT_UNROLL_STUPID and
        LOOP->lpt_decision.times to a power of two minus one.  On every
        rejection path the decision is left unchanged and, if dump_file is set,
        the reason is printed there.  */
1533 static void
1534 decide_unroll_stupid (struct loop *loop, int flags)
1535 {
1536   unsigned nunroll, nunroll_by_av, i;
1537   struct niter_desc *desc;
1538   widest_int iterations;
1539
1540   if (!(flags & UAP_UNROLL_ALL))
1541     {
1542       /* We were not asked to, just return back silently.  */
1543       return;
1544     }
1545
1546   if (dump_file)
1547     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1548
1549   /* nunroll = total number of copies of the original loop body in
1550      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).
          It is the smallest of three limits: the total insn budget, the
          average insn budget and the maximal number of unrollings.  */
1551   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1552   nunroll_by_av
1553     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1554   if (nunroll > nunroll_by_av)
1555     nunroll = nunroll_by_av;
1556   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1557     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1558
       /* The target hook is optional; when present its result replaces the
          factor computed above.  */
1559   if (targetm.loop_unroll_adjust)
1560     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1561
1562   /* Skip big loops.  */
1563   if (nunroll <= 1)
1564     {
1565       if (dump_file)
1566         fprintf (dump_file, ";; Not considering loop, is too big\n");
1567       return;
1568     }
1569
1570   /* Check for simple loops.  */
1571   desc = get_simple_loop_desc (loop);
1572
1573   /* Check simpleness.  Unlike in decide_unroll_runtime_iterations, a loop
          that is simple without assumptions is the one rejected here.  */
1574   if (desc->simple_p && !desc->assumptions)
1575     {
1576       if (dump_file)
1577         fprintf (dump_file, ";; The loop is simple\n");
1578       return;
1579     }
1580
1581   /* Do not unroll loops with branches inside -- it increases number
1582      of mispredicts.
1583      TODO: this heuristic needs tuning; call inside the loop body
1584      is also relatively good reason to not unroll.  */
1585   if (num_loop_branches (loop) > 1)
1586     {
1587       if (dump_file)
1588         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1589       return;
1590     }
1591
1592   /* Check whether the loop rolls.  The estimate is consulted first; the
          upper bound is only queried when the first call returns false.  The
          loop is rejected when the value found is below 2 * nunroll.  */
1593   if ((get_estimated_loop_iterations (loop, &iterations)
1594        || get_max_loop_iterations (loop, &iterations))
1595       && wi::ltu_p (iterations, 2 * nunroll))
1596     {
1597       if (dump_file)
1598         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1599       return;
1600     }
1601
1602   /* Success.  Now force nunroll to be power of 2, as it seems that this
1603      improves results (partially because of better alignments, partially
1604      because of some dark magic).  After the loop I is the largest power of
          two not exceeding nunroll.  */
1605   for (i = 1; 2 * i <= nunroll; i *= 2)
1606     continue;
1607
       /* TIMES counts the additional copies, hence I - 1.  */
1608   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1609   loop->lpt_decision.times = i - 1;
1610 }
1611
1612 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1613
1614    while (cond)
1615      body;
1616
1617    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1618
1619    while (cond)
1620      {
1621        body;
1622        if (!cond) break;
1623        body;
1624        if (!cond) break;
1625        body;
1626        if (!cond) break;
1627        body;
1628      }

        If the niter_desc of LOOP was marked simple, that mark is cleared at
        the end, as the description is not updated for the unrolled loop.
1629    */
1630 static void
1631 unroll_loop_stupid (struct loop *loop)
1632 {
1633   sbitmap wont_exit;
1634   unsigned nunroll = loop->lpt_decision.times;
1635   struct niter_desc *desc = get_simple_loop_desc (loop);
1636   struct opt_info *opt_info = NULL;
1637   bool ok;
1638
       /* The insns of the loop are analyzed only when one of the two unroller
          flags is set; otherwise OPT_INFO stays NULL.  */
1639   if (flag_split_ivs_in_unroller
1640       || flag_variable_expansion_in_unroller)
1641     opt_info = analyze_insns_in_loop (loop);
1642
1643
       /* No bit of WONT_EXIT is set, matching the picture above where every
          copy keeps its exit test.  */
1644   wont_exit = sbitmap_alloc (nunroll + 1);
1645   bitmap_clear (wont_exit);
1646   opt_info_start_duplication (opt_info);
1647
       /* The copies are placed on the latch edge, i.e. inside LOOP.  */
1648   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1649                                       nunroll, wont_exit,
1650                                       NULL, NULL,
1651                                       DLTHE_FLAG_UPDATE_FREQ
1652                                       | (opt_info
1653                                          ? DLTHE_RECORD_COPY_NUMBER
1654                                            : 0));
1655   gcc_assert (ok);
1656
1657   if (opt_info)
1658     {
1659       apply_opt_in_copies (opt_info, nunroll, true, true);
1660       free_opt_info (opt_info);
1661     }
1662
1663   free (wont_exit);
1664
1665   if (desc->simple_p)
1666     {
1667       /* We indeed may get here provided that there are nontrivial assumptions
1668          for a loop to be really simple.  We could update the counts, but the
1669          problem is that we are unable to decide which exit will be taken
1670          (not really true in case the number of iterations is constant,
1671          but no one will do anything with this information, so we do not
1672          worry about it).  */
1673       desc->simple_p = false;
1674     }
1675
1676   if (dump_file)
1677     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1678              nunroll, num_loop_insns (loop));
1679 }
1680
1681 /* Returns true if REG is referenced in exactly one nondebug insn in LOOP.
1682    *DEBUG_USES is incremented once for each debug insn visited that
1683    references REG; it is not reset here, so the caller has to initialize
        it.  */
1684
1685 bool
1686 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1687                                   int *debug_uses)
1688 {
1689   basic_block *body, bb;
1690   unsigned i;
1691   int count_ref = 0;
1692   rtx insn;
1693
1694   body = get_loop_body (loop);
1695   for (i = 0; i < loop->num_nodes; i++)
1696     {
1697       bb = body[i];
1698
           /* NOTE(review): assuming FOR_BB_INSNS expands to a plain loop, the
              break below only ends the walk of the current block; the
              remaining blocks are still visited, which does not change the
              result because COUNT_REF never decreases.  */
1699       FOR_BB_INSNS (bb, insn)
1700         if (!rtx_referenced_p (reg, insn))
1701           continue;
1702         else if (DEBUG_INSN_P (insn))
1703           ++*debug_uses;
1704         else if (++count_ref > 1)
1705           break;
1706     }
1707   free (body);
1708   return (count_ref  == 1);
1709 }
1710
1711 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  The
        location of each such insn is replaced by an unknown-location rtx, and
        the walk stops visiting further blocks once DEBUG_USES insns have been
        handled.  */
1712
1713 static void
1714 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1715 {
1716   basic_block *body, bb;
1717   unsigned i;
1718   rtx insn;
1719
1720   body = get_loop_body (loop);
       /* DEBUG_USES is counted down to zero; the loop condition stops the
          block walk as soon as nothing is left to reset.  */
1721   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1722     {
1723       bb = body[i];
1724
1725       FOR_BB_INSNS (bb, insn)
1726         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1727           continue;
1728         else
1729           {
1730             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1731                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1732             if (!--debug_uses)
1733               break;
1734           }
1735     }
1736   free (body);
1737 }
1738
1739 /* Determine whether INSN contains an accumulator
1740    which can be expanded into separate copies,
1741    one for each copy of the LOOP body.
1742
1743    for (i = 0 ; i < n; i++)
1744      sum += a[i];
1745
1746    ==>
1747
1748    sum += a[i]
1749    ....
1750    i = i+1;
1751    sum1 += a[i]
1752    ....
1753    i = i+1
1754    sum2 += a[i];
1755    ....
1756
1757    Return NULL if INSN contains no opportunity for expansion of accumulator.
1758    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1759    information and return a pointer to it.
1760 */
1761
1762 static struct var_to_expand *
1763 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1764 {
1765   rtx set, dest, src;
1766   struct var_to_expand *ves;
1767   unsigned accum_pos;
1768   enum rtx_code code;
1769   int debug_uses = 0;
1770
1771   set = single_set (insn);
1772   if (!set)
1773     return NULL;
1774
1775   dest = SET_DEST (set);
1776   src = SET_SRC (set);
1777   code = GET_CODE (src);
1778
1779   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1780     return NULL;
1781
1782   if (FLOAT_MODE_P (GET_MODE (dest)))
1783     {
1784       if (!flag_associative_math)
1785         return NULL;
1786       /* In the case of FMA, we're also changing the rounding.  */
1787       if (code == FMA && !flag_unsafe_math_optimizations)
1788         return NULL;
1789     }
1790
1791   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1792      in MD.  But if there is no optab to generate the insn, we can not
1793      perform the variable expansion.  This can happen if an MD provides
1794      an insn but not a named pattern to generate it, for example to avoid
1795      producing code that needs additional mode switches like for x87/mmx.
1796
1797      So we check have_insn_for which looks for an optab for the operation
1798      in SRC.  If it doesn't exist, we can't perform the expansion even
1799      though INSN is valid.  */
1800   if (!have_insn_for (code, GET_MODE (src)))
1801     return NULL;
1802
1803   if (!REG_P (dest)
1804       && !(GET_CODE (dest) == SUBREG
1805            && REG_P (SUBREG_REG (dest))))
1806     return NULL;
1807
1808   /* Find the accumulator use within the operation.  */
1809   if (code == FMA)
1810     {
1811       /* We only support accumulation via FMA in the ADD position.  */
1812       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1813         return NULL;
1814       accum_pos = 2;
1815     }
1816   else if (rtx_equal_p (dest, XEXP (src, 0)))
1817     accum_pos = 0;
1818   else if (rtx_equal_p (dest, XEXP (src, 1)))
1819     {
1820       /* The method of expansion that we are using; which includes the
1821          initialization of the expansions with zero and the summation of
1822          the expansions at the end of the computation will yield wrong
1823          results for (x = something - x) thus avoid using it in that case.  */
1824       if (code == MINUS)
1825         return NULL;
1826       accum_pos = 1;
1827     }
1828   else
1829     return NULL;
1830
1831   /* It must not otherwise be used.  */
1832   if (code == FMA)
1833     {
1834       if (rtx_referenced_p (dest, XEXP (src, 0))
1835           || rtx_referenced_p (dest, XEXP (src, 1)))
1836         return NULL;
1837     }
1838   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1839     return NULL;
1840
1841   /* It must be used in exactly one insn.  */
1842   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1843     return NULL;
1844
1845   if (dump_file)
1846     {
1847       fprintf (dump_file, "\n;; Expanding Accumulator ");
1848       print_rtl (dump_file, dest);
1849       fprintf (dump_file, "\n");
1850     }
1851
1852   if (debug_uses)
1853     /* Instead of resetting the debug insns, we could replace each
1854        debug use in the loop with the sum or product of all expanded
1855        accummulators.  Since we'll only know of all expansions at the
1856        end, we'd have to keep track of which vars_to_expand a debug
1857        insn in the loop references, take note of each copy of the
1858        debug insn during unrolling, and when it's all done, compute
1859        the sum or product of each variable and adjust the original
1860        debug insn and each copy thereof.  What a pain!  */
1861     reset_debug_uses_in_loop (loop, dest, debug_uses);
1862
1863   /* Record the accumulator to expand.  */
1864   ves = XNEW (struct var_to_expand);
1865   ves->insn = insn;
1866   ves->reg = copy_rtx (dest);
1867   ves->var_expansions.create (1);
1868   ves->next = NULL;
1869   ves->op = GET_CODE (src);
1870   ves->expansion_count = 0;
1871   ves->reuse_expansion = 0;
1872   return ves;
1873 }
1874
1875 /* Determine whether there is an induction variable in INSN that
1876    we would like to split during unrolling.
1877
1878    I.e. replace
1879
1880    i = i + 1;
1881    ...
1882    i = i + 1;
1883    ...
1884    i = i + 1;
1885    ...
1886
1887    type chains by
1888
1889    i0 = i + 1
1890    ...
1891    i = i0 + 1
1892    ...
1893    i = i0 + 2
1894    ...
1895
1896    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1897    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1898    pointer to it.  */
1899
1900 static struct iv_to_split *
1901 analyze_iv_to_split_insn (rtx insn)
1902 {
1903   rtx set, dest;
1904   struct rtx_iv iv;
1905   struct iv_to_split *ivts;
1906   bool ok;
1907
1908   /* For now we just split the basic induction variables.  Later this may be
1909      extended for example by selecting also addresses of memory references.  */
1910   set = single_set (insn);
1911   if (!set)
1912     return NULL;
1913
1914   dest = SET_DEST (set);
1915   if (!REG_P (dest))
1916     return NULL;
1917
1918   if (!biv_p (insn, dest))
1919     return NULL;
1920
1921   ok = iv_analyze_result (insn, dest, &iv);
1922
1923   /* This used to be an assert under the assumption that if biv_p returns
1924      true that iv_analyze_result must also return true.  However, that
1925      assumption is not strictly correct as evidenced by pr25569.
1926
1927      Returning NULL when iv_analyze_result returns false is safe and
1928      avoids the problems in pr25569 until the iv_analyze_* routines
1929      can be fixed, which is apparently hard and time consuming
1930      according to their author.  */
1931   if (! ok)
1932     return NULL;
1933
1934   if (iv.step == const0_rtx
1935       || iv.mode != iv.extend_mode)
1936     return NULL;
1937
1938   /* Record the insn to split.  */
1939   ivts = XNEW (struct iv_to_split);
1940   ivts->insn = insn;
1941   ivts->orig_var = dest;
1942   ivts->base_var = NULL_RTX;
1943   ivts->step = iv.step;
1944   ivts->next = NULL;
1945   ivts->n_loc = 1;
1946   ivts->loc[0] = 1;
1947
1948   return ivts;
1949 }
1950
1951 /* Determines which of insns in LOOP can be optimized.
1952    Return a OPT_INFO struct with the relevant hash tables filled
1953    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1954    is undefined for the return value.  */
1955
1956 static struct opt_info *
1957 analyze_insns_in_loop (struct loop *loop)
1958 {
1959   basic_block *body, bb;
1960   unsigned i;
1961   struct opt_info *opt_info = XCNEW (struct opt_info);
1962   rtx insn;
1963   struct iv_to_split *ivts = NULL;
1964   struct var_to_expand *ves = NULL;
1965   iv_to_split **slot1;
1966   var_to_expand **slot2;
1967   vec<edge> edges = get_loop_exit_edges (loop);
1968   edge exit;
1969   bool can_apply = false;
1970
1971   iv_analysis_loop_init (loop);
1972
1973   body = get_loop_body (loop);
1974
1975   if (flag_split_ivs_in_unroller)
1976     {
1977       opt_info->insns_to_split.create (5 * loop->num_nodes);
1978       opt_info->iv_to_split_head = NULL;
1979       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1980     }
1981
1982   /* Record the loop exit bb and loop preheader before the unrolling.  */
1983   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1984
1985   if (edges.length () == 1)
1986     {
1987       exit = edges[0];
1988       if (!(exit->flags & EDGE_COMPLEX))
1989         {
1990           opt_info->loop_exit = split_edge (exit);
1991           can_apply = true;
1992         }
1993     }
1994
1995   if (flag_variable_expansion_in_unroller
1996       && can_apply)
1997     {
1998       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1999       opt_info->var_to_expand_head = NULL;
2000       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
2001     }
2002
2003   for (i = 0; i < loop->num_nodes; i++)
2004     {
2005       bb = body[i];
2006       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
2007         continue;
2008
2009       FOR_BB_INSNS (bb, insn)
2010       {
2011         if (!INSN_P (insn))
2012           continue;
2013
2014         if (opt_info->insns_to_split.is_created ())
2015           ivts = analyze_iv_to_split_insn (insn);
2016
2017         if (ivts)
2018           {
2019             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
2020             gcc_assert (*slot1 == NULL);
2021             *slot1 = ivts;
2022             *opt_info->iv_to_split_tail = ivts;
2023             opt_info->iv_to_split_tail = &ivts->next;
2024             continue;
2025           }
2026
2027         if (opt_info->insns_with_var_to_expand.is_created ())
2028           ves = analyze_insn_to_expand_var (loop, insn);
2029
2030         if (ves)
2031           {
2032             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
2033             gcc_assert (*slot2 == NULL);
2034             *slot2 = ves;
2035             *opt_info->var_to_expand_tail = ves;
2036             opt_info->var_to_expand_tail = &ves->next;
2037           }
2038       }
2039     }
2040
2041   edges.release ();
2042   free (body);
2043   return opt_info;
2044 }
2045
2046 /* Called just before loop duplication.  Records start of duplicated area
2047    to OPT_INFO.  */
2048
2049 static void
2050 opt_info_start_duplication (struct opt_info *opt_info)
2051 {
2052   if (opt_info)
2053     opt_info->first_new_block = last_basic_block_for_fn (cfun);
2054 }
2055
/* Return the number of iterations between the initialization of the
   base variable and the copy number N_COPY.  N_COPIES is the total
   number of newly created copies.  UNROLLING is true if the loop is
   being unrolled (not peeled).

   When unrolling, the initialization is done in the original loop body
   (number 0), so the distance is N_COPY itself.  When peeling, the
   initialization occurs in copy number 1 and the original loop
   (number 0) comes last.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    return n_copy;

  return n_copy != 0 ? n_copy - 1 : n_copies;
}
2080
2081 /* Locate in EXPR the expression corresponding to the location recorded
2082    in IVTS, and return a pointer to the RTX for this location.  */
2083
2084 static rtx *
2085 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2086 {
2087   unsigned i;
2088   rtx *ret = &expr;
2089
2090   for (i = 0; i < ivts->n_loc; i++)
2091     ret = &XEXP (*ret, ivts->loc[i]);
2092
2093   return ret;
2094 }
2095
2096 /* Allocate basic variable for the induction variable chain.  */
2097
2098 static void
2099 allocate_basic_variable (struct iv_to_split *ivts)
2100 {
2101   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2102
2103   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2104 }
2105
2106 /* Insert initialization of basic variable of IVTS before INSN, taking
2107    the initial value from INSN.  */
2108
2109 static void
2110 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2111 {
2112   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2113   rtx seq;
2114
2115   start_sequence ();
2116   expr = force_operand (expr, ivts->base_var);
2117   if (expr != ivts->base_var)
2118     emit_move_insn (ivts->base_var, expr);
2119   seq = get_insns ();
2120   end_sequence ();
2121
2122   emit_insn_before (seq, insn);
2123 }
2124
2125 /* Replace the use of induction variable described in IVTS in INSN
2126    by base variable + DELTA * step.  */
2127
2128 static void
2129 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2130 {
2131   rtx expr, *loc, seq, incr, var;
2132   enum machine_mode mode = GET_MODE (ivts->base_var);
2133   rtx src, dest, set;
2134
2135   /* Construct base + DELTA * step.  */
2136   if (!delta)
2137     expr = ivts->base_var;
2138   else
2139     {
2140       incr = simplify_gen_binary (MULT, mode,
2141                                   ivts->step, gen_int_mode (delta, mode));
2142       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2143                                   ivts->base_var, incr);
2144     }
2145
2146   /* Figure out where to do the replacement.  */
2147   loc = get_ivts_expr (single_set (insn), ivts);
2148
2149   /* If we can make the replacement right away, we're done.  */
2150   if (validate_change (insn, loc, expr, 0))
2151     return;
2152
2153   /* Otherwise, force EXPR into a register and try again.  */
2154   start_sequence ();
2155   var = gen_reg_rtx (mode);
2156   expr = force_operand (expr, var);
2157   if (expr != var)
2158     emit_move_insn (var, expr);
2159   seq = get_insns ();
2160   end_sequence ();
2161   emit_insn_before (seq, insn);
2162
2163   if (validate_change (insn, loc, var, 0))
2164     return;
2165
2166   /* The last chance.  Try recreating the assignment in insn
2167      completely from scratch.  */
2168   set = single_set (insn);
2169   gcc_assert (set);
2170
2171   start_sequence ();
2172   *loc = var;
2173   src = copy_rtx (SET_SRC (set));
2174   dest = copy_rtx (SET_DEST (set));
2175   src = force_operand (src, dest);
2176   if (src != dest)
2177     emit_move_insn (dest, src);
2178   seq = get_insns ();
2179   end_sequence ();
2180
2181   emit_insn_before (seq, insn);
2182   delete_insn (insn);
2183 }
2184
2185
2186 /* Return one expansion of the accumulator recorded in struct VE.  */
2187
2188 static rtx
2189 get_expansion (struct var_to_expand *ve)
2190 {
2191   rtx reg;
2192
2193   if (ve->reuse_expansion == 0)
2194     reg = ve->reg;
2195   else
2196     reg = ve->var_expansions[ve->reuse_expansion - 1];
2197
2198   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2199     ve->reuse_expansion = 0;
2200   else
2201     ve->reuse_expansion++;
2202
2203   return reg;
2204 }
2205
2206
2207 /* Given INSN replace the uses of the accumulator recorded in VE
2208    with a new register.  */
2209
2210 static void
2211 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2212 {
2213   rtx new_reg, set;
2214   bool really_new_expansion = false;
2215
2216   set = single_set (insn);
2217   gcc_assert (set);
2218
2219   /* Generate a new register only if the expansion limit has not been
2220      reached.  Else reuse an already existing expansion.  */
2221   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2222     {
2223       really_new_expansion = true;
2224       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2225     }
2226   else
2227     new_reg = get_expansion (ve);
2228
2229   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2230   if (apply_change_group ())
2231     if (really_new_expansion)
2232       {
2233         ve->var_expansions.safe_push (new_reg);
2234         ve->expansion_count++;
2235       }
2236 }
2237
2238 /* Initialize the variable expansions in loop preheader.  PLACE is the
2239    loop-preheader basic block where the initialization of the
2240    expansions should take place.  The expansions are initialized with
2241    (-0) when the operation is plus or minus to honor sign zero.  This
2242    way we can prevent cases where the sign of the final result is
2243    effected by the sign of the expansion.  Here is an example to
2244    demonstrate this:
2245
2246    for (i = 0 ; i < n; i++)
2247      sum += something;
2248
2249    ==>
2250
2251    sum += something
2252    ....
2253    i = i+1;
2254    sum1 += something
2255    ....
2256    i = i+1
2257    sum2 += something;
2258    ....
2259
2260    When SUM is initialized with -zero and SOMETHING is also -zero; the
2261    final result of sum should be -zero thus the expansions sum1 and sum2
2262    should be initialized with -zero as well (otherwise we will get +zero
2263    as the final result).  */
2264
2265 static void
2266 insert_var_expansion_initialization (struct var_to_expand *ve,
2267                                      basic_block place)
2268 {
2269   rtx seq, var, zero_init;
2270   unsigned i;
2271   enum machine_mode mode = GET_MODE (ve->reg);
2272   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2273
2274   if (ve->var_expansions.length () == 0)
2275     return;
2276
2277   start_sequence ();
2278   switch (ve->op)
2279     {
2280     case FMA:
2281       /* Note that we only accumulate FMA via the ADD operand.  */
2282     case PLUS:
2283     case MINUS:
2284       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2285         {
2286           if (honor_signed_zero_p)
2287             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2288           else
2289             zero_init = CONST0_RTX (mode);
2290           emit_move_insn (var, zero_init);
2291         }
2292       break;
2293
2294     case MULT:
2295       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2296         {
2297           zero_init = CONST1_RTX (GET_MODE (var));
2298           emit_move_insn (var, zero_init);
2299         }
2300       break;
2301
2302     default:
2303       gcc_unreachable ();
2304     }
2305
2306   seq = get_insns ();
2307   end_sequence ();
2308
2309   emit_insn_after (seq, BB_END (place));
2310 }
2311
2312 /* Combine the variable expansions at the loop exit.  PLACE is the
2313    loop exit basic block where the summation of the expansions should
2314    take place.  */
2315
2316 static void
2317 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2318 {
2319   rtx sum = ve->reg;
2320   rtx expr, seq, var, insn;
2321   unsigned i;
2322
2323   if (ve->var_expansions.length () == 0)
2324     return;
2325
2326   start_sequence ();
2327   switch (ve->op)
2328     {
2329     case FMA:
2330       /* Note that we only accumulate FMA via the ADD operand.  */
2331     case PLUS:
2332     case MINUS:
2333       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2334         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2335       break;
2336
2337     case MULT:
2338       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2339         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2340       break;
2341
2342     default:
2343       gcc_unreachable ();
2344     }
2345
2346   expr = force_operand (sum, ve->reg);
2347   if (expr != ve->reg)
2348     emit_move_insn (ve->reg, expr);
2349   seq = get_insns ();
2350   end_sequence ();
2351
2352   insn = BB_HEAD (place);
2353   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2354     insn = NEXT_INSN (insn);
2355
2356   emit_insn_after (seq, insn);
2357 }
2358
2359 /* Strip away REG_EQUAL notes for IVs we're splitting.
2360
2361    Updating REG_EQUAL notes for IVs we split is tricky: We
2362    cannot tell until after unrolling, DF-rescanning, and liveness
2363    updating, whether an EQ_USE is reached by the split IV while
2364    the IV reg is still live.  See PR55006.
2365
2366    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2367    because RTL loop-iv requires us to defer rescanning insns and
2368    any notes attached to them.  So resort to old techniques...  */
2369
2370 static void
2371 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2372 {
2373   struct iv_to_split *ivts;
2374   rtx note = find_reg_equal_equiv_note (insn);
2375   if (! note)
2376     return;
2377   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2378     if (reg_mentioned_p (ivts->orig_var, note))
2379       {
2380         remove_note (insn, note);
2381         return;
2382       }
2383 }
2384
2385 /* Apply loop optimizations in loop copies using the
2386    data which gathered during the unrolling.  Structure
2387    OPT_INFO record that data.
2388
2389    UNROLLING is true if we unrolled (not peeled) the loop.
2390    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2391    the loop (as it should happen in complete unrolling, but not in ordinary
2392    peeling of the loop).  */
2393
2394 static void
2395 apply_opt_in_copies (struct opt_info *opt_info,
2396                      unsigned n_copies, bool unrolling,
2397                      bool rewrite_original_loop)
2398 {
2399   unsigned i, delta;
2400   basic_block bb, orig_bb;
2401   rtx insn, orig_insn, next;
2402   struct iv_to_split ivts_templ, *ivts;
2403   struct var_to_expand ve_templ, *ves;
2404
2405   /* Sanity check -- we need to put initialization in the original loop
2406      body.  */
2407   gcc_assert (!unrolling || rewrite_original_loop);
2408
2409   /* Allocate the basic variables (i0).  */
2410   if (opt_info->insns_to_split.is_created ())
2411     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2412       allocate_basic_variable (ivts);
2413
2414   for (i = opt_info->first_new_block;
2415        i < (unsigned) last_basic_block_for_fn (cfun);
2416        i++)
2417     {
2418       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2419       orig_bb = get_bb_original (bb);
2420
2421       /* bb->aux holds position in copy sequence initialized by
2422          duplicate_loop_to_header_edge.  */
2423       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2424                                         unrolling);
2425       bb->aux = 0;
2426       orig_insn = BB_HEAD (orig_bb);
2427       FOR_BB_INSNS_SAFE (bb, insn, next)
2428         {
2429           if (!INSN_P (insn)
2430               || (DEBUG_INSN_P (insn)
2431                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2432             continue;
2433
2434           while (!INSN_P (orig_insn)
2435                  || (DEBUG_INSN_P (orig_insn)
2436                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2437                          == LABEL_DECL)))
2438             orig_insn = NEXT_INSN (orig_insn);
2439
2440           ivts_templ.insn = orig_insn;
2441           ve_templ.insn = orig_insn;
2442
2443           /* Apply splitting iv optimization.  */
2444           if (opt_info->insns_to_split.is_created ())
2445             {
2446               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2447
2448               ivts = opt_info->insns_to_split.find (&ivts_templ);
2449
2450               if (ivts)
2451                 {
2452                   gcc_assert (GET_CODE (PATTERN (insn))
2453                               == GET_CODE (PATTERN (orig_insn)));
2454
2455                   if (!delta)
2456                     insert_base_initialization (ivts, insn);
2457                   split_iv (ivts, insn, delta);
2458                 }
2459             }
2460           /* Apply variable expansion optimization.  */
2461           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2462             {
2463               ves = (struct var_to_expand *)
2464                 opt_info->insns_with_var_to_expand.find (&ve_templ);
2465               if (ves)
2466                 {
2467                   gcc_assert (GET_CODE (PATTERN (insn))
2468                               == GET_CODE (PATTERN (orig_insn)));
2469                   expand_var_during_unrolling (ves, insn);
2470                 }
2471             }
2472           orig_insn = NEXT_INSN (orig_insn);
2473         }
2474     }
2475
2476   if (!rewrite_original_loop)
2477     return;
2478
2479   /* Initialize the variable expansions in the loop preheader
2480      and take care of combining them at the loop exit.  */
2481   if (opt_info->insns_with_var_to_expand.is_created ())
2482     {
2483       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2484         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2485       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2486         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2487     }
2488
2489   /* Rewrite also the original loop body.  Find them as originals of the blocks
2490      in the last copied iteration, i.e. those that have
2491      get_bb_copy (get_bb_original (bb)) == bb.  */
2492   for (i = opt_info->first_new_block;
2493        i < (unsigned) last_basic_block_for_fn (cfun);
2494        i++)
2495     {
2496       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2497       orig_bb = get_bb_original (bb);
2498       if (get_bb_copy (orig_bb) != bb)
2499         continue;
2500
2501       delta = determine_split_iv_delta (0, n_copies, unrolling);
2502       for (orig_insn = BB_HEAD (orig_bb);
2503            orig_insn != NEXT_INSN (BB_END (bb));
2504            orig_insn = next)
2505         {
2506           next = NEXT_INSN (orig_insn);
2507
2508           if (!INSN_P (orig_insn))
2509             continue;
2510
2511           ivts_templ.insn = orig_insn;
2512           if (opt_info->insns_to_split.is_created ())
2513             {
2514               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2515
2516               ivts = (struct iv_to_split *)
2517                 opt_info->insns_to_split.find (&ivts_templ);
2518               if (ivts)
2519                 {
2520                   if (!delta)
2521                     insert_base_initialization (ivts, orig_insn);
2522                   split_iv (ivts, orig_insn, delta);
2523                   continue;
2524                 }
2525             }
2526
2527         }
2528     }
2529 }
2530
2531 /* Release OPT_INFO.  */
2532
2533 static void
2534 free_opt_info (struct opt_info *opt_info)
2535 {
2536   if (opt_info->insns_to_split.is_created ())
2537     opt_info->insns_to_split.dispose ();
2538   if (opt_info->insns_with_var_to_expand.is_created ())
2539     {
2540       struct var_to_expand *ves;
2541
2542       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2543         ves->var_expansions.release ();
2544       opt_info->insns_with_var_to_expand.dispose ();
2545     }
2546   free (opt_info);
2547 }