gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "hard-reg-set.h"
  26 #include "obstack.h"
  27 #include "basic-block.h"
  28 #include "cfgloop.h"
  29 #include "params.h"
  30 #include "expr.h"
  31 #include "hash-table.h"
  32 #include "recog.h"
  33 #include "target.h"
  34 #include "dumpfile.h"
  35
  36 /* This pass performs loop unrolling and peeling.  We only perform these
  37    optimizations on innermost loops (with single exception) because
  38    the impact on performance is greatest here, and we want to avoid
  39    unnecessary code size growth.  The gain is caused by greater sequentiality
  40    of code, better code to optimize for further passes and in some cases
  41    by fewer testings of exit conditions.  The main problem is code growth,
  42    that impacts performance negatively due to effect of caches.
  43
  44    What we do:
  45
  46    -- complete peeling of once-rolling loops; this is the above mentioned
  47       exception, as this causes loop to be cancelled completely and
  48       does not cause code growth
  49    -- complete peeling of loops that roll (small) constant times.
  50    -- simple peeling of first iterations of loops that do not roll much
  51       (according to profile feedback)
  52    -- unrolling of loops that roll constant times; this is almost always
  53       win, as we get rid of exit condition tests.
  54    -- unrolling of loops that roll number of times that we can compute
  55       in runtime; we also get rid of exit condition tests here, but there
  56       is the extra expense for calculating the number of iterations
  57    -- simple unrolling of remaining loops; this is performed only if we
  58       are asked to, as the gain is questionable in this case and often
  59       it may even slow down the code
  60    For more detailed descriptions of each of those, see comments at
  61    appropriate function below.
  62
  63    There are a lot of parameters (defined and described in params.def) that
  64    control how much we unroll/peel.
  65
  66    ??? A great problem is that we don't have a good way to determine
  67    how many times we should unroll the loop; the experiments I have made
  68    showed that this choice may affect performance on the order of several %.
  69    */
  70
  71 /* Information about induction variables to split.  Records are looked
        up by INSN through iv_split_hasher and chained through NEXT.  */
  72
  73 struct iv_to_split
  74 {
  75   rtx insn;             /* The insn in which the induction variable occurs.  */
  76   rtx orig_var;         /* The variable (register) for the IV before split.  */
  77   rtx base_var;         /* The variable on which the values in the further
  78                            iterations are based.  */
  79   rtx step;             /* Step of the induction variable.  */
  80   struct iv_to_split *next; /* Next entry in walking order.  */
  81   unsigned n_loc;       /* Number of entries of LOC that are in use.  */
  82   unsigned loc[3];      /* Location where the definition of the induction
  83                            variable occurs in the insn.  For example if
  84                            N_LOC is 2, the expression is located at
  85                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  86 };
  87
  88 /* Information about accumulators to expand.  Records are looked up by
        INSN through var_expand_hasher and chained through NEXT.  */
  89
  90 struct var_to_expand
  91 {
  92   rtx insn;                        /* The insn in which the variable expansion occurs.  */
  93   rtx reg;                         /* The accumulator which is expanded.  */
  94   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  95   struct var_to_expand *next;      /* Next entry in walking order.  */
  96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  97                                       or multiplication.  */
  98   int expansion_count;             /* Count the number of expansions generated so far.  */
  99   int reuse_expansion;             /* The expansion we intend to reuse to expand
 100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 101                                       the original accumulator.  Else use
 102                                       var_expansions[REUSE_EXPANSION - 1].  */
 103 };
 104
 105 /* Hashtable helper for iv_to_split.  Declares the hash and equality
        operations used for iv_to_split records; the removal policy is
        inherited from typed_free_remove <iv_to_split>.  */
 106
 107 struct iv_split_hasher : typed_free_remove <iv_to_split>
 108 {
 109   typedef iv_to_split value_type;
 110   typedef iv_to_split compare_type;
 111   static inline hashval_t hash (const value_type *);
 112   static inline bool equal (const value_type *, const compare_type *);
 113 };
 114
 115
 116 /* Hash an iv_to_split record by the UID of the insn it describes.  */
 117
 118 inline hashval_t
 119 iv_split_hasher::hash (const value_type *entry)
 120 {
 121   return static_cast <hashval_t> (INSN_UID (entry->insn));
 122 }
 123
 124 /* Two iv_to_split records are equal when they describe the same insn.  */
 125
 126 inline bool
 127 iv_split_hasher::equal (const value_type *lhs, const compare_type *rhs)
 128 {
 129   return lhs->insn == rhs->insn;
 130 }
 131
 132 /* Hashtable helper for var_to_expand.  Declares the hash and equality
        operations used for var_to_expand records; the removal policy is
        inherited from typed_free_remove <var_to_expand>.  */
 133
 134 struct var_expand_hasher : typed_free_remove <var_to_expand>
 135 {
 136   typedef var_to_expand value_type;
 137   typedef var_to_expand compare_type;
 138   static inline hashval_t hash (const value_type *);
 139   static inline bool equal (const value_type *, const compare_type *);
 140 };
 141
 142 /* Hash a var_to_expand record by the UID of its insn.  */
 143
 144 inline hashval_t
 145 var_expand_hasher::hash (const value_type *entry)
 146 {
 147   return static_cast <hashval_t> (INSN_UID (entry->insn));
 148 }
 149
 150 /* Return true if LHS and RHS refer to the same instruction.  */
 151
 152 inline bool
 153 var_expand_hasher::equal (const value_type *lhs, const compare_type *rhs)
 154 {
 155   return lhs->insn == rhs->insn;
 156 }
 157
 158 /* Information about the optimizations (splitting of induction variables
 159    and expansion of accumulators) applied in the unrolled loop.  For each
        of the two kinds of record there is a hash table keyed by insn and a
        head/tail pair of a list.  */
 160
 161 struct opt_info
 162 {
 163   hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
 164                                                   split.  */
 165   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 166   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 167   hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
 168                                         insns with accumulators to expand.  */
 169   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 170   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 171   unsigned first_new_block;        /* The first basic block that was
 172                                       duplicated (held as an unsigned;
                                           presumably a block index -- confirm).  */
 173   basic_block loop_exit;           /* The loop exit basic block.  */
 174   basic_block loop_preheader;      /* The loop preheader basic block.  */
 175 };
 176
     /* Prototypes of file-local (static) functions: the decide_* routines
        choose a transformation for a loop, the peel_* and unroll_* routines
        carry it out, and the remaining ones support the optimizations
        recorded in struct opt_info.  */
 177 static void decide_unrolling_and_peeling (int);
 178 static void peel_loops_completely (int);
 179 static void decide_peel_simple (struct loop *, int);
 180 static void decide_peel_once_rolling (struct loop *, int);
 181 static void decide_peel_completely (struct loop *, int);
 182 static void decide_unroll_stupid (struct loop *, int);
 183 static void decide_unroll_constant_iterations (struct loop *, int);
 184 static void decide_unroll_runtime_iterations (struct loop *, int);
 185 static void peel_loop_simple (struct loop *);
 186 static void peel_loop_completely (struct loop *);
 187 static void unroll_loop_stupid (struct loop *);
 188 static void unroll_loop_constant_iterations (struct loop *);
 189 static void unroll_loop_runtime_iterations (struct loop *);
 190 static struct opt_info *analyze_insns_in_loop (struct loop *);
 191 static void opt_info_start_duplication (struct opt_info *);
 192 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 193 static void free_opt_info (struct opt_info *);
 194 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 195 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 196 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 197 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 198 static void insert_var_expansion_initialization (struct var_to_expand *,
 199                                                  basic_block);
 200 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 201                                              basic_block);
 202 static rtx get_expansion (struct var_to_expand *);
 203
 204 /* Emit a message summarizing the unroll or peel that will be
 205    performed for LOOP, along with the loop's location LOCUS, if
 206    appropriate given the dump or -fopt-info settings.  Nothing is
        emitted when the decision is LPT_NONE or when dump_enabled_p ()
        is false.  */
 207
 208 static void
 209 report_unroll_peel (struct loop *loop, location_t locus)
 210 {
 211   struct niter_desc *desc;
 212   int niters = 0;
 213   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 214
 215   if (loop->lpt_decision.decision == LPT_NONE)
 216     return;
 217
 218   if (!dump_enabled_p ())
 219     return;
 220
 221   /* In the special case where the loop never iterated, emit
 222      a different message so that we don't report an unroll by 0.
 223      This matches the equivalent message emitted during tree unrolling.  */
 224   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 225       && !loop->lpt_decision.times)
 226     {
 227       dump_printf_loc (report_flags, locus,
 228                        "loop turned into non-loop; it never loops.\n");
 229       return;
 230     }
 231
 232   desc = get_simple_loop_desc (loop);
 233
       /* NITERS is printed only in the complete-peeling suffix below.  It is
          the constant iteration count when one is known; otherwise the
          expected count, which is queried only when the header has a nonzero
          execution count.  In all other cases it stays 0.  */
 234   if (desc->const_iter)
 235     niters = desc->niter;
 236   else if (loop->header->count)
 237     niters = expected_loop_iterations (loop);
 238
       /* Main part of the message.  For complete peeling the number printed
          is lpt_decision.times + 1.  */
 239   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 240     dump_printf_loc (report_flags, locus,
 241                      "loop with %d iterations completely unrolled",
 242                      loop->lpt_decision.times + 1);
 243   else
 244     dump_printf_loc (report_flags, locus,
 245                      "loop %s %d times",
 246                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 247                        ? "peeled" : "unrolled"),
 248                      loop->lpt_decision.times);
       /* Optional parenthesized suffix.  With profile data the parenthesis is
          opened here; the header count is printed after a cast to int.  */
 249   if (profile_info)
 250     dump_printf (report_flags,
 251                  " (header execution count %d",
 252                  (int)loop->header->count);
       /* For complete peeling the iteration count is appended and the
          parenthesis closed; it is opened here (" (") when no profile data
          opened it above.  */
 253   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 254     dump_printf (report_flags,
 255                  "%s%s iterations %d)",
 256                  profile_info ? ", " : " (",
 257                  desc->const_iter ? "const" : "average",
 258                  niters);
       /* Otherwise just close the parenthesis opened for the header count.  */
 259   else if (profile_info)
 260     dump_printf (report_flags, ")");
 261
 262   dump_printf (report_flags, "\n");
 263 }
 264
 265 /* Entry point: unroll and/or peel loops as selected by FLAGS.  */
 266 void
 267 unroll_and_peel_loops (int flags)
 268 {
 269   loop_iterator iter;
 270   struct loop *loop;
 271   bool cfg_changed = false;
 272
 273   /* Complete peeling goes first: it is almost surely a win and it
 274      strongly affects the parameters of the later decisions.  */
 275   peel_loops_completely (flags);
 276
 277   /* Choose a transformation for each of the remaining loops.  */
 278   decide_unrolling_and_peeling (flags);
 279
 280   /* Apply the recorded decisions, innermost loops first.  */
 281   FOR_EACH_LOOP (iter, loop, LI_FROM_INNERMOST)
 282     {
 283       /* Dispatch on what was decided for this loop.  */
 284       switch (loop->lpt_decision.decision)
 285         {
 286         case LPT_NONE:
 287           break;
 288         case LPT_PEEL_SIMPLE:
 289           peel_loop_simple (loop);
 290           cfg_changed = true;
 291           break;
 292         case LPT_UNROLL_CONSTANT:
 293           unroll_loop_constant_iterations (loop);
 294           cfg_changed = true;
 295           break;
 296         case LPT_UNROLL_RUNTIME:
 297           unroll_loop_runtime_iterations (loop);
 298           cfg_changed = true;
 299           break;
 300         case LPT_UNROLL_STUPID:
 301           unroll_loop_stupid (loop);
 302           cfg_changed = true;
 303           break;
 304         case LPT_PEEL_COMPLETELY:
 305           /* Complete peeling has already been carried out.  */
 306           gcc_unreachable ();
 307         default:
 308           gcc_unreachable ();
 309         }
 310     }
 311
 312   if (cfg_changed)
 313     {
 314       calculate_dominance_info (CDI_DOMINATORS);
 315       fix_loop_structure (NULL);
 316     }
 317
 318   iv_analysis_done ();
 319 }
 320
 321 /* Return true if the exit test of LOOP is at the end of the loop body.  */
 322
 323 static bool
 324 loop_exit_at_end_p (struct loop *loop)
 325 {
 326   struct niter_desc *desc = get_simple_loop_desc (loop);
 327   basic_block latch = loop->latch;
 328   rtx cur;
 329
 330   if (desc->in_edge->dest != latch)
 331     return false;
 332
 333   /* The latch may hold no NONDEBUG_INSN_P insn.  */
 334   FOR_BB_INSNS (latch, cur)
 335     {
 336       if (NONDEBUG_INSN_P (cur))
 337         return false;
 338     }
 339   return true;
 340 }
 341
 342 /* Depending on FLAGS, decide which loops to peel completely and peel them.  */
 343 static void
 344 peel_loops_completely (int flags)
 345 {
 346   loop_iterator iter;
 347   struct loop *loop;
 348   bool cfg_changed = false;
 349
 350   /* Visit every loop, innermost ones first.  */
 351   FOR_EACH_LOOP (iter, loop, LI_FROM_INNERMOST)
 352     {
 353       loop->lpt_decision.decision = LPT_NONE;
 354       location_t where = get_loop_location (loop);
 355
 356       if (dump_enabled_p ())
 357         dump_printf_loc (TDF_RTL, where,
 358                          ";; *** Considering loop %d at BB %d for "
 359                          "complete peeling ***\n",
 360                          loop->num, loop->header->index);
 361
 362       loop->ninsns = num_loop_insns (loop);
 363
 364       decide_peel_once_rolling (loop, flags);
 365       if (loop->lpt_decision.decision == LPT_NONE)
 366         decide_peel_completely (loop, flags);
 367
 368       if (loop->lpt_decision.decision != LPT_PEEL_COMPLETELY)
 369         continue;
 370
 371       report_unroll_peel (loop, where);
 372       peel_loop_completely (loop);
 373       cfg_changed = true;
 374     }
 375
 376   if (cfg_changed)
 377     {
 378       calculate_dominance_info (CDI_DOMINATORS);
 379       fix_loop_structure (NULL);
 380     }
 381 }
 382
 383 /* Choose, for every loop that is eligible, which unrolling or peeling
 384    transformation (if any) to apply, depending on FLAGS.  Each loop's
 385    decision is reset to LPT_NONE before the decide_* routines are tried.  */
 386 static void
 387 decide_unrolling_and_peeling (int flags)
 388 {
 389   loop_iterator iter;
 390   struct loop *loop;
 391
 392   /* Visit every loop, innermost ones first.  */
 393   FOR_EACH_LOOP (iter, loop, LI_FROM_INNERMOST)
 394     {
 395       const char *skip_reason = NULL;
 396
 397       loop->lpt_decision.decision = LPT_NONE;
 398       location_t where = get_loop_location (loop);
 399
 400       if (dump_enabled_p ())
 401         dump_printf_loc (TDF_RTL, where,
 402                          ";; *** Considering loop %d at BB %d for "
 403                          "unrolling and peeling ***\n",
 404                          loop->num, loop->header->index);
 405
 406       /* Filter out loops that are never transformed.  The tests run in
 407          this order and the first one that fires supplies the
 408          explanation written to the dump file.  */
 409       if (optimize_loop_for_size_p (loop))
 410         /* Do not peel cold areas.  */
 411         skip_reason = ";; Not considering loop, cold area\n";
 412       else if (!can_duplicate_loop_p (loop))
 413         /* The loop cannot be manipulated.  */
 414         skip_reason = ";; Not considering loop, cannot duplicate\n";
 415       else if (loop->inner)
 416         /* Only innermost loops are handled.  */
 417         skip_reason = ";; Not considering loop, is not innermost\n";
 418
 419       if (skip_reason)
 420         {
 421           if (dump_file)
 422             fprintf (dump_file, "%s", skip_reason);
 423           continue;
 424         }
 425
 426       /* Record the size figures before the decide_* routines run.  */
 427       loop->ninsns = num_loop_insns (loop);
 428       loop->av_ninsns = average_num_loop_insns (loop);
 429
 430       /* Try the transformations one by one in decreasing order of
 431          priority; the next one is attempted only while the decision
 432          is still LPT_NONE.  */
 433       decide_unroll_constant_iterations (loop, flags);
 434       if (loop->lpt_decision.decision == LPT_NONE)
 435         decide_unroll_runtime_iterations (loop, flags);
 436       if (loop->lpt_decision.decision == LPT_NONE)
 437         decide_unroll_stupid (loop, flags);
 438       if (loop->lpt_decision.decision == LPT_NONE)
 439         decide_peel_simple (loop, flags);
 440
 441       report_unroll_peel (loop, where);
 442     }
 443 }
 444
 445 /* Decide whether LOOP provably rolls exactly once and is small enough
 446    for complete peeling; if so record LPT_PEEL_COMPLETELY.  */
 447 static void
 448 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 449 {
 450   if (dump_file)
 451     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 452
 453   /* Reject loops whose body exceeds the size limit.  */
 454   unsigned max_insns = (unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS);
 455   if (loop->ninsns > max_insns)
 456     {
 457       if (dump_file)
 458         fprintf (dump_file, ";; Not considering loop, is too big\n");
 459       return;
 460     }
 461
 462   /* Fetch the simple-loop description.  */
 463   struct niter_desc *desc = get_simple_loop_desc (loop);
 464
 465   /* Simple, assumption-free, finite, constant iteration count; and either
 466      that count or max_loop_iterations_int (LOOP) is zero.  */
 467   bool rolls_once = (desc->simple_p
 468                      && !desc->assumptions
 469                      && !desc->infinite
 470                      && desc->const_iter
 471                      && (desc->niter == 0
 472                          || max_loop_iterations_int (loop) == 0));
 473   if (!rolls_once)
 474     {
 475       if (dump_file)
 476         fprintf (dump_file,
 477                  ";; Unable to prove that the loop rolls exactly once\n");
 478       return;
 479     }
 480
 481   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 482 }
 483
 484 /* Decide whether the LOOP is suitable for complete peeling.  On success
        LOOP->lpt_decision.decision is set to LPT_PEEL_COMPLETELY; otherwise
        it is left untouched and the reason is written to the dump file, if
        there is one.  FLAGS is unused.  */
 485 static void
 486 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 487 {
 488   unsigned npeel;
 489   struct niter_desc *desc;
 490
 491   if (dump_file)
 492     fprintf (dump_file, "\n;; Considering peeling completely\n");
 493
 494   /* Skip non-innermost loops.  */
 495   if (loop->inner)
 496     {
 497       if (dump_file)
 498         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 499       return;
 500     }
 501
 502   /* Do not peel cold areas.  */
 503   if (optimize_loop_for_size_p (loop))
 504     {
 505       if (dump_file)
 506         fprintf (dump_file, ";; Not considering loop, cold area\n");
 507       return;
 508     }
 509
 510   /* Can the loop be manipulated?  */
 511   if (!can_duplicate_loop_p (loop))
 512     {
 513       if (dump_file)
 514         fprintf (dump_file,
 515                  ";; Not considering loop, cannot duplicate\n");
 516       return;
 517     }
 518
 519   /* npeel = number of iterations to peel: the insn budget divided by the
          loop size, capped by PARAM_MAX_COMPLETELY_PEEL_TIMES.
          NOTE(review): assumes LOOP->ninsns is nonzero here -- confirm that
          num_loop_insns never yields 0.  */
 520   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 521   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 522     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 523
 524   /* Is the loop small enough?  */
 525   if (!npeel)
 526     {
 527       if (dump_file)
 528         fprintf (dump_file, ";; Not considering loop, is too big\n");
 529       return;
 530     }
 531
 532   /* Check for simple loops.  */
 533   desc = get_simple_loop_desc (loop);
 534
 535   /* Check number of iterations.  */
 536   if (!desc->simple_p
 537       || desc->assumptions
 538       || !desc->const_iter
 539       || desc->infinite)
 540     {
 541       if (dump_file)
 542         fprintf (dump_file,
 543                  ";; Unable to prove that the loop iterates constant times\n");
 544       return;
 545     }
 546
       /* NPEEL is nonzero at this point, so NPEEL - 1 cannot wrap around.  */
 547   if (desc->niter > npeel - 1)
 548     {
 549       if (dump_file)
 550         {
 551           fprintf (dump_file,
 552                    ";; Not peeling loop completely, rolls too much (");
 553           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
               /* NPEEL is unsigned, so it is printed with %u.  */
 554           fprintf (dump_file, " iterations > %u [maximum peelings])\n", npeel);
 555         }
 556       return;
 557     }
 558
 559   /* Success.  */
 560   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 561 }
 562
 563 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 564    completely.  The transformation done:
 565
 566    for (i = 0; i < 4; i++)
 567      body;
 568
 569    ==>
 570
 571    i = 0;
 572    body; i++;
 573    body; i++;
 574    body; i++;
 575    body; i++;
        
        The number of copies made is the iteration count recorded in the
        loop's simple description; when it is zero nothing is duplicated and
        only the path through the description's in_edge is removed.
 576    */
 577 static void
 578 peel_loop_completely (struct loop *loop)
 579 {
 580   sbitmap wont_exit;
 581   unsigned HOST_WIDE_INT npeel;
 582   unsigned i;
 583   vec<edge> remove_edges;
 584   edge ein;
 585   struct niter_desc *desc = get_simple_loop_desc (loop);
 586   struct opt_info *opt_info = NULL;
 587
 588   npeel = desc->niter;
 589
 590   if (npeel)
 591     {
 592       bool ok;
 593
           /* NPEEL + 1 bits, all set except bit 0, and bit 1 as well when
              there are noloop assumptions.
              NOTE(review): presumably a set bit tells
              duplicate_loop_to_header_edge that the matching copy will not
              leave through DESC->out_edge -- confirm against its
              documentation.  */
 594       wont_exit = sbitmap_alloc (npeel + 1);
 595       bitmap_ones (wont_exit);
 596       bitmap_clear_bit (wont_exit, 0);
 597       if (desc->noloop_assumptions)
 598         bitmap_clear_bit (wont_exit, 1);
 599
 600       remove_edges.create (0);
 601
           /* Only IV splitting is analyzed here; OPT_INFO stays NULL when
              the flag is off, and is passed on as such below.  */
 602       if (flag_split_ivs_in_unroller)
 603         opt_info = analyze_insns_in_loop (loop);
 604
 605       opt_info_start_duplication (opt_info);
 606       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 607                                           npeel,
 608                                           wont_exit, desc->out_edge,
 609                                           &remove_edges,
 610                                           DLTHE_FLAG_UPDATE_FREQ
 611                                           | DLTHE_FLAG_COMPLETTE_PEEL
 612                                           | (opt_info
 613                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 614       gcc_assert (ok);
 615
 616       free (wont_exit);
 617
 618       if (opt_info)
 619         {
 620           apply_opt_in_copies (opt_info, npeel, false, true);
 621           free_opt_info (opt_info);
 622         }
 623
 624       /* Remove the exit edges.  */
 625       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 626         remove_path (ein);
 627       remove_edges.release ();
 628     }
 629
       /* Read the in-edge out of the description before the description is
          freed.  */
 630   ein = desc->in_edge;
 631   free_simple_loop_desc (loop);
 632
 633   /* Now remove the unreachable part of the last iteration and cancel
 634      the loop.  */
 635   remove_path (ein);
 636
 637   if (dump_file)
 638     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 639 }
 640
 641 /* Decide whether to unroll LOOP iterating constant number of times
 642    and how much.  Does nothing unless FLAGS contains UAP_UNROLL.  On
        success LOOP->lpt_decision is set to LPT_UNROLL_CONSTANT with TIMES
        holding the chosen value; otherwise the decision is left untouched
        and the reason is written to the dump file, if there is one.  */
 643
 644 static void
 645 decide_unroll_constant_iterations (struct loop *loop, int flags)
 646 {
 647   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 648   struct niter_desc *desc;
 649   double_int iterations;
 650
 651   if (!(flags & UAP_UNROLL))
 652     {
 653       /* We were not asked to, just return back silently.  */
 654       return;
 655     }
 656
 657   if (dump_file)
 658     fprintf (dump_file,
 659              "\n;; Considering unrolling loop with constant "
 660              "number of iterations\n");
 661
 662   /* nunroll = total number of copies of the original loop body in
 663      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.
          It is the smallest of the two insn budgets divided by the
          respective loop size and PARAM_MAX_UNROLL_TIMES.  */
 664   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 665   nunroll_by_av
 666     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 667   if (nunroll > nunroll_by_av)
 668     nunroll = nunroll_by_av;
 669   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 670     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 671
 672   /* Skip big loops.  */
 673   if (nunroll <= 1)
 674     {
 675       if (dump_file)
 676         fprintf (dump_file, ";; Not considering loop, is too big\n");
 677       return;
 678     }
 679
 680   /* Check for simple loops.  */
 681   desc = get_simple_loop_desc (loop);
 682
 683   /* Check number of iterations.  */
 684   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 685     {
 686       if (dump_file)
 687         fprintf (dump_file,
 688                  ";; Unable to prove that the loop iterates constant times\n");
 689       return;
 690     }
 691
 692   /* Check whether the loop rolls enough to consider.
 693      Consult also loop bounds and profile; in the case the loop has more
 694      than one exit it may well loop less than determined maximal number
 695      of iterations.  The estimate is tried first and the maximum is
          consulted only when no estimate is available.  */
 696   if (desc->niter < 2 * nunroll
 697       || ((estimated_loop_iterations (loop, &iterations)
 698            || max_loop_iterations (loop, &iterations))
 699           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 700     {
 701       if (dump_file)
 702         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 703       return;
 704     }
 705
 706   /* Success; now compute number of iterations to unroll.  We alter
 707      nunroll so that as few as possible copies of loop body are
 708      necessary, while still not decreasing the number of unrollings
 709      too much (at most by 1).  */
 710   best_copies = 2 * nunroll + 10;
 711
       /* The search starts at 2 * NUNROLL + 2, or at NITER - 2 when NITER is
          too small for that.  NITER is at least 2 * NUNROLL and NUNROLL is
          at least 2 here, so NITER - 2 does not wrap around.  */
 712   i = 2 * nunroll + 2;
 713   if (i - 1 >= desc->niter)
 714     i = desc->niter - 2;
 715
       /* Walk the candidates downwards to NUNROLL - 1 (which is at least 1,
          so the unsigned countdown terminates).  Only a strictly smaller
          N_COPIES replaces the best one, so among equal costs the candidate
          visited first, i.e. the larger I, is kept.  */
 716   for (; i >= nunroll - 1; i--)
 717     {
 718       unsigned exit_mod = desc->niter % (i + 1);
 719
           /* Cost of candidate I: I + 1 plus EXIT_MOD when the exit is not at
              the end of the loop; I + 2 plus EXIT_MOD when it is at the end;
              and just I + 1 when the exit is at the end, EXIT_MOD equals I
              and there are no noloop assumptions.  */
 720       if (!loop_exit_at_end_p (loop))
 721         n_copies = exit_mod + i + 1;
 722       else if (exit_mod != (unsigned) i
 723                || desc->noloop_assumptions != NULL_RTX)
 724         n_copies = exit_mod + i + 2;
 725       else
 726         n_copies = i + 1;
 727
 728       if (n_copies < best_copies)
 729         {
 730           best_copies = n_copies;
 731           best_unroll = i;
 732         }
 733     }
 734
 735   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 736   loop->lpt_decision.times = best_unroll;
 737 }
 738
 739 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 740    The transformation does this:
 741
 742    for (i = 0; i < 102; i++)
 743      body;
 744
 745    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 746
 747    i = 0;
 748    body; i++;
 749    body; i++;
 750    while (i < 102)
 751      {
 752        body; i++;
 753        body; i++;
 754        body; i++;
 755        body; i++;
 756      }
 757   */
 758 static void
 759 unroll_loop_constant_iterations (struct loop *loop)
 760 {
 761   unsigned HOST_WIDE_INT niter;
 762   unsigned exit_mod;
 763   sbitmap wont_exit;
 764   unsigned i;
 765   vec<edge> remove_edges;
 766   edge e;
 767   unsigned max_unroll = loop->lpt_decision.times;
 768   struct niter_desc *desc = get_simple_loop_desc (loop);
 769   bool exit_at_end = loop_exit_at_end_p (loop);
 770   struct opt_info *opt_info = NULL;
 771   bool ok;
 772
 773   niter = desc->niter;
 774
 775   /* Should not get here (such loop should be peeled instead).  */
 776   gcc_assert (niter > max_unroll + 1);
 777
 778   exit_mod = niter % (max_unroll + 1);
 779
 780   wont_exit = sbitmap_alloc (max_unroll + 1);
 781   bitmap_ones (wont_exit);
 782
 783   remove_edges.create (0);
 784   if (flag_split_ivs_in_unroller
 785       || flag_variable_expansion_in_unroller)
 786     opt_info = analyze_insns_in_loop (loop);
 787
 788   if (!exit_at_end)
 789     {
 790       /* The exit is not at the end of the loop; leave exit test
 791          in the first copy, so that the loops that start with test
 792          of exit condition have continuous body after unrolling.  */
 793
 794       if (dump_file)
 795         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 796
 797       /* Peel exit_mod iterations.  */
 798       bitmap_clear_bit (wont_exit, 0);
 799       if (desc->noloop_assumptions)
 800         bitmap_clear_bit (wont_exit, 1);
 801
 802       if (exit_mod)
 803         {
 804           opt_info_start_duplication (opt_info);
 805           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 806                                               exit_mod,
 807                                               wont_exit, desc->out_edge,
 808                                               &remove_edges,
 809                                               DLTHE_FLAG_UPDATE_FREQ
 810                                               | (opt_info && exit_mod > 1
 811                                                  ? DLTHE_RECORD_COPY_NUMBER
 812                                                    : 0));
 813           gcc_assert (ok);
 814
 815           if (opt_info && exit_mod > 1)
 816             apply_opt_in_copies (opt_info, exit_mod, false, false);
 817
 818           desc->noloop_assumptions = NULL_RTX;
 819           desc->niter -= exit_mod;
 820           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 821           if (loop->any_estimate
 822               && double_int::from_uhwi (exit_mod).ule
 823                    (loop->nb_iterations_estimate))
 824             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 825           else
 826             loop->any_estimate = false;
 827         }
 828
 829       bitmap_set_bit (wont_exit, 1);
 830     }
 831   else
 832     {
 833       /* Leave exit test in last copy, for the same reason as above if
 834          the loop tests the condition at the end of loop body.  */
 835
 836       if (dump_file)
 837         fprintf (dump_file, ";; Condition at end of loop.\n");
 838
 839       /* We know that niter >= max_unroll + 2; so we do not need to care of
 840          case when we would exit before reaching the loop.  So just peel
 841          exit_mod + 1 iterations.  */
 842       if (exit_mod != max_unroll
 843           || desc->noloop_assumptions)
 844         {
 845           bitmap_clear_bit (wont_exit, 0);
 846           if (desc->noloop_assumptions)
 847             bitmap_clear_bit (wont_exit, 1);
 848
 849           opt_info_start_duplication (opt_info);
 850           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 851                                               exit_mod + 1,
 852                                               wont_exit, desc->out_edge,
 853                                               &remove_edges,
 854                                               DLTHE_FLAG_UPDATE_FREQ
 855                                               | (opt_info && exit_mod > 0
 856                                                  ? DLTHE_RECORD_COPY_NUMBER
 857                                                    : 0));
 858           gcc_assert (ok);
 859
 860           if (opt_info && exit_mod > 0)
 861             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 862
 863           desc->niter -= exit_mod + 1;
 864           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 865           if (loop->any_estimate
 866               && double_int::from_uhwi (exit_mod + 1).ule
 867                    (loop->nb_iterations_estimate))
 868             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 869           else
 870             loop->any_estimate = false;
 871           desc->noloop_assumptions = NULL_RTX;
 872
 873           bitmap_set_bit (wont_exit, 0);
 874           bitmap_set_bit (wont_exit, 1);
 875         }
 876
 877       bitmap_clear_bit (wont_exit, max_unroll);
 878     }
 879
 880   /* Now unroll the loop.  */
 881
 882   opt_info_start_duplication (opt_info);
 883   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 884                                       max_unroll,
 885                                       wont_exit, desc->out_edge,
 886                                       &remove_edges,
 887                                       DLTHE_FLAG_UPDATE_FREQ
 888                                       | (opt_info
 889                                          ? DLTHE_RECORD_COPY_NUMBER
 890                                            : 0));
 891   gcc_assert (ok);
 892
 893   if (opt_info)
 894     {
 895       apply_opt_in_copies (opt_info, max_unroll, true, true);
 896       free_opt_info (opt_info);
 897     }
 898
 899   free (wont_exit);
 900
 901   if (exit_at_end)
 902     {
 903       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 904       /* Find a new in and out edge; they are in the last copy we have made.  */
 905
 906       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 907         {
 908           desc->out_edge = EDGE_SUCC (exit_block, 0);
 909           desc->in_edge = EDGE_SUCC (exit_block, 1);
 910         }
 911       else
 912         {
 913           desc->out_edge = EDGE_SUCC (exit_block, 1);
 914           desc->in_edge = EDGE_SUCC (exit_block, 0);
 915         }
 916     }
 917
 918   desc->niter /= max_unroll + 1;
 919   loop->nb_iterations_upper_bound
 920     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 921                                                                    + 1),
 922                                             TRUNC_DIV_EXPR);
 923   if (loop->any_estimate)
 924     loop->nb_iterations_estimate
 925       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 926                                                                   + 1),
 927                                            TRUNC_DIV_EXPR);
 928   desc->niter_expr = GEN_INT (desc->niter);
 929
 930   /* Remove the edges.  */
 931   FOR_EACH_VEC_ELT (remove_edges, i, e)
 932     remove_path (e);
 933   remove_edges.release ();
 934
 935   if (dump_file)
 936     fprintf (dump_file,
 937              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 938              max_unroll, num_loop_insns (loop));
 939 }
 940
 941 /* Decide whether to unroll LOOP iterating runtime computable number of times
 942    and how much.  */
 943 static void
 944 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 945 {
 946   unsigned nunroll, nunroll_by_av, i;
 947   struct niter_desc *desc;
 948   double_int iterations;
 949
 950   if (!(flags & UAP_UNROLL))
 951     {
 952       /* We were not asked to, just return back silently.  */
 953       return;
 954     }
 955
 956   if (dump_file)
 957     fprintf (dump_file,
 958              "\n;; Considering unrolling loop with runtime "
 959              "computable number of iterations\n");
 960
 961   /* nunroll = total number of copies of the original loop body in
 962      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 963   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 964   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 965   if (nunroll > nunroll_by_av)
 966     nunroll = nunroll_by_av;
 967   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 968     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 969
 970   if (targetm.loop_unroll_adjust)
 971     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 972
 973   /* Skip big loops.  */
 974   if (nunroll <= 1)
 975     {
 976       if (dump_file)
 977         fprintf (dump_file, ";; Not considering loop, is too big\n");
 978       return;
 979     }
 980
 981   /* Check for simple loops.  */
 982   desc = get_simple_loop_desc (loop);
 983
 984   /* Check simpleness.  */
 985   if (!desc->simple_p || desc->assumptions)
 986     {
 987       if (dump_file)
 988         fprintf (dump_file,
 989                  ";; Unable to prove that the number of iterations "
 990                  "can be counted in runtime\n");
 991       return;
 992     }
 993
 994   if (desc->const_iter)
 995     {
 996       if (dump_file)
 997         fprintf (dump_file, ";; Loop iterates constant times\n");
 998       return;
 999     }
1000
1001   /* Check whether the loop rolls.  */
1002   if ((estimated_loop_iterations (loop, &iterations)
1003        || max_loop_iterations (loop, &iterations))
1004       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1005     {
1006       if (dump_file)
1007         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1008       return;
1009     }
1010
1011   /* Success; now force nunroll to be power of 2, as we are unable to
1012      cope with overflows in computation of number of iterations.  */
1013   for (i = 1; 2 * i <= nunroll; i *= 2)
1014     continue;
1015
1016   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1017   loop->lpt_decision.times = i - 1;
1018 }
1019
1020 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1021    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1022    and NULL is returned instead.  */
1023
1024 basic_block
1025 split_edge_and_insert (edge e, rtx insns)
1026 {
1027   basic_block bb;
1028
1029   if (!insns)
1030     return NULL;
1031   bb = split_edge (e);
1032   emit_insn_after (insns, BB_END (bb));
1033
1034   /* ??? We used to assume that INSNS can contain control flow insns, and
1035      that we had to try to find sub basic blocks in BB to maintain a valid
1036      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1037      and call break_superblocks when going out of cfglayout mode.  But it
1038      turns out that this never happens; and that if it does ever happen,
1039      the TODO_verify_flow at the end of the RTL loop passes would fail.
1040
1041      There are two reasons why we expected we could have control flow insns
1042      in INSNS.  The first is when a comparison has to be done in parts, and
1043      the second is when the number of iterations is computed for loops with
1044      the number of iterations known at runtime.  In both cases, test cases
1045      to get control flow in INSNS appear to be impossible to construct:
1046
1047       * If do_compare_rtx_and_jump needs several branches to do comparison
1048         in a mode that needs comparison by parts, we cannot analyze the
1049         number of iterations of the loop, and we never get to unrolling it.
1050
1051       * The code in expand_divmod that was suspected to cause creation of
1052         branching code seems to be only accessed for signed division.  The
1053         divisions used by # of iterations analysis are always unsigned.
1054         Problems might arise on architectures that emits branching code
1055         for some operations that may appear in the unroller (especially
1056         for division), but we have no such architectures.
1057
1058      Considering all this, it was decided that we should for now assume
1059      that INSNS can in theory contain control flow insns, but in practice
1060      it never does.  So we don't handle the theoretical case, and should
1061      a real failure ever show up, we have a pretty good clue for how to
1062      fix it.  */
1063
1064   return bb;
1065 }
1066
1067 /* Unroll LOOP for which we are able to count number of iterations in runtime
1068    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1069    extra care for case n < 0):
1070
1071    for (i = 0; i < n; i++)
1072      body;
1073
1074    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1075
1076    i = 0;
1077    mod = n % 4;
1078
1079    switch (mod)
1080      {
1081        case 3:
1082          body; i++;
1083        case 2:
1084          body; i++;
1085        case 1:
1086          body; i++;
1087        case 0: ;
1088      }
1089
1090    while (i < n)
1091      {
1092        body; i++;
1093        body; i++;
1094        body; i++;
1095        body; i++;
1096      }
1097    */
1098 static void
1099 unroll_loop_runtime_iterations (struct loop *loop)
1100 {
1101   rtx old_niter, niter, init_code, branch_code, tmp;
1102   unsigned i, j, p;
1103   basic_block preheader, *body, swtch, ezc_swtch;
1104   vec<basic_block> dom_bbs;
1105   sbitmap wont_exit;
1106   int may_exit_copy;
1107   unsigned n_peel;
1108   vec<edge> remove_edges;
1109   edge e;
1110   bool extra_zero_check, last_may_exit;
1111   unsigned max_unroll = loop->lpt_decision.times;
1112   struct niter_desc *desc = get_simple_loop_desc (loop);
1113   bool exit_at_end = loop_exit_at_end_p (loop);
1114   struct opt_info *opt_info = NULL;
1115   bool ok;
1116
1117   if (flag_split_ivs_in_unroller
1118       || flag_variable_expansion_in_unroller)
1119     opt_info = analyze_insns_in_loop (loop);
1120
1121   /* Remember blocks whose dominators will have to be updated.  */
1122   dom_bbs.create (0);
1123
1124   body = get_loop_body (loop);
1125   for (i = 0; i < loop->num_nodes; i++)
1126     {
1127       vec<basic_block> ldom;
1128       basic_block bb;
1129
1130       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1131       FOR_EACH_VEC_ELT (ldom, j, bb)
1132         if (!flow_bb_inside_loop_p (loop, bb))
1133           dom_bbs.safe_push (bb);
1134
1135       ldom.release ();
1136     }
1137   free (body);
1138
1139   if (!exit_at_end)
1140     {
1141       /* Leave exit in first copy (for explanation why see comment in
1142          unroll_loop_constant_iterations).  */
1143       may_exit_copy = 0;
1144       n_peel = max_unroll - 1;
1145       extra_zero_check = true;
1146       last_may_exit = false;
1147     }
1148   else
1149     {
1150       /* Leave exit in last copy (for explanation why see comment in
1151          unroll_loop_constant_iterations).  */
1152       may_exit_copy = max_unroll;
1153       n_peel = max_unroll;
1154       extra_zero_check = false;
1155       last_may_exit = true;
1156     }
1157
1158   /* Get expression for number of iterations.  */
1159   start_sequence ();
1160   old_niter = niter = gen_reg_rtx (desc->mode);
1161   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1162   if (tmp != niter)
1163     emit_move_insn (niter, tmp);
1164
1165   /* Count modulo by ANDing it with max_unroll; we use the fact that
1166      the number of unrollings is a power of two, and thus this is correct
1167      even if there is overflow in the computation.  */
1168   niter = expand_simple_binop (desc->mode, AND,
1169                                niter,
1170                                GEN_INT (max_unroll),
1171                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1172
1173   init_code = get_insns ();
1174   end_sequence ();
1175   unshare_all_rtl_in_chain (init_code);
1176
1177   /* Precondition the loop.  */
1178   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1179
1180   remove_edges.create (0);
1181
1182   wont_exit = sbitmap_alloc (max_unroll + 2);
1183
1184   /* Peel the first copy of loop body (almost always we must leave exit test
1185      here; the only exception is when we have extra zero check and the number
1186      of iterations is reliable.  Also record the place of (possible) extra
1187      zero check.  */
1188   bitmap_clear (wont_exit);
1189   if (extra_zero_check
1190       && !desc->noloop_assumptions)
1191     bitmap_set_bit (wont_exit, 1);
1192   ezc_swtch = loop_preheader_edge (loop)->src;
1193   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1194                                       1, wont_exit, desc->out_edge,
1195                                       &remove_edges,
1196                                       DLTHE_FLAG_UPDATE_FREQ);
1197   gcc_assert (ok);
1198
1199   /* Record the place where switch will be built for preconditioning.  */
1200   swtch = split_edge (loop_preheader_edge (loop));
1201
1202   for (i = 0; i < n_peel; i++)
1203     {
1204       /* Peel the copy.  */
1205       bitmap_clear (wont_exit);
1206       if (i != n_peel - 1 || !last_may_exit)
1207         bitmap_set_bit (wont_exit, 1);
1208       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1209                                           1, wont_exit, desc->out_edge,
1210                                           &remove_edges,
1211                                           DLTHE_FLAG_UPDATE_FREQ);
1212       gcc_assert (ok);
1213
1214       /* Create item for switch.  */
1215       j = n_peel - i - (extra_zero_check ? 0 : 1);
1216       p = REG_BR_PROB_BASE / (i + 2);
1217
1218       preheader = split_edge (loop_preheader_edge (loop));
1219       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1220                                           block_label (preheader), p,
1221                                           NULL_RTX);
1222
1223       /* We rely on the fact that the compare and jump cannot be optimized out,
1224          and hence the cfg we create is correct.  */
1225       gcc_assert (branch_code != NULL_RTX);
1226
1227       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1228       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1229       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1230       e = make_edge (swtch, preheader,
1231                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1232       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1233       e->probability = p;
1234     }
1235
1236   if (extra_zero_check)
1237     {
1238       /* Add branch for zero iterations.  */
1239       p = REG_BR_PROB_BASE / (max_unroll + 1);
1240       swtch = ezc_swtch;
1241       preheader = split_edge (loop_preheader_edge (loop));
1242       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1243                                           block_label (preheader), p,
1244                                           NULL_RTX);
1245       gcc_assert (branch_code != NULL_RTX);
1246
1247       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1248       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1249       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1250       e = make_edge (swtch, preheader,
1251                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1252       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1253       e->probability = p;
1254     }
1255
1256   /* Recount dominators for outer blocks.  */
1257   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1258
1259   /* And unroll loop.  */
1260
1261   bitmap_ones (wont_exit);
1262   bitmap_clear_bit (wont_exit, may_exit_copy);
1263   opt_info_start_duplication (opt_info);
1264
1265   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1266                                       max_unroll,
1267                                       wont_exit, desc->out_edge,
1268                                       &remove_edges,
1269                                       DLTHE_FLAG_UPDATE_FREQ
1270                                       | (opt_info
1271                                          ? DLTHE_RECORD_COPY_NUMBER
1272                                            : 0));
1273   gcc_assert (ok);
1274
1275   if (opt_info)
1276     {
1277       apply_opt_in_copies (opt_info, max_unroll, true, true);
1278       free_opt_info (opt_info);
1279     }
1280
1281   free (wont_exit);
1282
1283   if (exit_at_end)
1284     {
1285       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1286       /* Find a new in and out edge; they are in the last copy we have
1287          made.  */
1288
1289       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1290         {
1291           desc->out_edge = EDGE_SUCC (exit_block, 0);
1292           desc->in_edge = EDGE_SUCC (exit_block, 1);
1293         }
1294       else
1295         {
1296           desc->out_edge = EDGE_SUCC (exit_block, 1);
1297           desc->in_edge = EDGE_SUCC (exit_block, 0);
1298         }
1299     }
1300
1301   /* Remove the edges.  */
1302   FOR_EACH_VEC_ELT (remove_edges, i, e)
1303     remove_path (e);
1304   remove_edges.release ();
1305
1306   /* We must be careful when updating the number of iterations due to
1307      preconditioning and the fact that the value must be valid at entry
1308      of the loop.  After passing through the above code, we see that
1309      the correct new number of iterations is this:  */
1310   gcc_assert (!desc->const_iter);
1311   desc->niter_expr =
1312     simplify_gen_binary (UDIV, desc->mode, old_niter,
1313                          GEN_INT (max_unroll + 1));
1314   loop->nb_iterations_upper_bound
1315     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1316                                                                    + 1),
1317                                             TRUNC_DIV_EXPR);
1318   if (loop->any_estimate)
1319     loop->nb_iterations_estimate
1320       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1321                                                                   + 1),
1322                                            TRUNC_DIV_EXPR);
1323   if (exit_at_end)
1324     {
1325       desc->niter_expr =
1326         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1327       desc->noloop_assumptions = NULL_RTX;
1328       --loop->nb_iterations_upper_bound;
1329       if (loop->any_estimate
1330           && loop->nb_iterations_estimate != double_int_zero)
1331         --loop->nb_iterations_estimate;
1332       else
1333         loop->any_estimate = false;
1334     }
1335
1336   if (dump_file)
1337     fprintf (dump_file,
1338              ";; Unrolled loop %d times, counting # of iterations "
1339              "in runtime, %i insns\n",
1340              max_unroll, num_loop_insns (loop));
1341
1342   dom_bbs.release ();
1343 }
1344
1345 /* Decide whether to simply peel LOOP and how much.  */
1346 static void
1347 decide_peel_simple (struct loop *loop, int flags)
1348 {
1349   unsigned npeel;
1350   double_int iterations;
1351
1352   if (!(flags & UAP_PEEL))
1353     {
1354       /* We were not asked to, just return back silently.  */
1355       return;
1356     }
1357
1358   if (dump_file)
1359     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1360
1361   /* npeel = number of iterations to peel.  */
1362   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1363   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1364     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1365
1366   /* Skip big loops.  */
1367   if (!npeel)
1368     {
1369       if (dump_file)
1370         fprintf (dump_file, ";; Not considering loop, is too big\n");
1371       return;
1372     }
1373
1374   /* Do not simply peel loops with branches inside -- it increases number
1375      of mispredicts.
1376      Exception is when we do have profile and we however have good chance
1377      to peel proper number of iterations loop will iterate in practice.
1378      TODO: this heuristic needs tunning; while for complette unrolling
1379      the branch inside loop mostly eliminates any improvements, for
1380      peeling it is not the case.  Also a function call inside loop is
1381      also branch from branch prediction POV (and probably better reason
1382      to not unroll/peel).  */
1383   if (num_loop_branches (loop) > 1
1384       && profile_status != PROFILE_READ)
1385     {
1386       if (dump_file)
1387         fprintf (dump_file, ";; Not peeling, contains branches\n");
1388       return;
1389     }
1390
1391   /* If we have realistic estimate on number of iterations, use it.  */
1392   if (estimated_loop_iterations (loop, &iterations))
1393     {
1394       if (double_int::from_shwi (npeel).ule (iterations))
1395         {
1396           if (dump_file)
1397             {
1398               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1399               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1400                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1401               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1402                        npeel);
1403             }
1404           return;
1405         }
1406       npeel = iterations.to_shwi () + 1;
1407     }
1408   /* If we have small enough bound on iterations, we can still peel (completely
1409      unroll).  */
1410   else if (max_loop_iterations (loop, &iterations)
1411            && iterations.ult (double_int::from_shwi (npeel)))
1412     npeel = iterations.to_shwi () + 1;
1413   else
1414     {
1415       /* For now we have no good heuristics to decide whether loop peeling
1416          will be effective, so disable it.  */
1417       if (dump_file)
1418         fprintf (dump_file,
1419                  ";; Not peeling loop, no evidence it will be profitable\n");
1420       return;
1421     }
1422
1423   /* Success.  */
1424   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1425   loop->lpt_decision.times = npeel;
1426 }
1427
1428 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1429
1430    while (cond)
1431      body;
1432
1433    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1434
1435    if (!cond) goto end;
1436    body;
1437    if (!cond) goto end;
1438    body;
1439    if (!cond) goto end;
1440    body;
1441    while (cond)
1442      body;
1443    end: ;
1444    */
1445 static void
1446 peel_loop_simple (struct loop *loop)
1447 {
1448   sbitmap wont_exit;
1449   unsigned npeel = loop->lpt_decision.times;
1450   struct niter_desc *desc = get_simple_loop_desc (loop);
1451   struct opt_info *opt_info = NULL;
1452   bool ok;
1453
1454   if (flag_split_ivs_in_unroller && npeel > 1)
1455     opt_info = analyze_insns_in_loop (loop);
1456
1457   wont_exit = sbitmap_alloc (npeel + 1);
1458   bitmap_clear (wont_exit);
1459
1460   opt_info_start_duplication (opt_info);
1461
1462   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1463                                       npeel, wont_exit, NULL,
1464                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1465                                       | (opt_info
1466                                          ? DLTHE_RECORD_COPY_NUMBER
1467                                            : 0));
1468   gcc_assert (ok);
1469
1470   free (wont_exit);
1471
1472   if (opt_info)
1473     {
1474       apply_opt_in_copies (opt_info, npeel, false, false);
1475       free_opt_info (opt_info);
1476     }
1477
1478   if (desc->simple_p)
1479     {
1480       if (desc->const_iter)
1481         {
1482           desc->niter -= npeel;
1483           desc->niter_expr = GEN_INT (desc->niter);
1484           desc->noloop_assumptions = NULL_RTX;
1485         }
1486       else
1487         {
1488           /* We cannot just update niter_expr, as its value might be clobbered
1489              inside loop.  We could handle this by counting the number into
1490              temporary just like we do in runtime unrolling, but it does not
1491              seem worthwhile.  */
1492           free_simple_loop_desc (loop);
1493         }
1494     }
1495   if (dump_file)
1496     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1497 }
1498
1499 /* Decide whether to unroll LOOP stupidly and how much.  */
1500 static void
1501 decide_unroll_stupid (struct loop *loop, int flags)
1502 {
1503   unsigned nunroll, nunroll_by_av, i;
1504   struct niter_desc *desc;
1505   double_int iterations;
1506
1507   if (!(flags & UAP_UNROLL_ALL))
1508     {
1509       /* We were not asked to, just return back silently.  */
1510       return;
1511     }
1512
1513   if (dump_file)
1514     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1515
1516   /* nunroll = total number of copies of the original loop body in
1517      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1518   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1519   nunroll_by_av
1520     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1521   if (nunroll > nunroll_by_av)
1522     nunroll = nunroll_by_av;
1523   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1524     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1525
1526   if (targetm.loop_unroll_adjust)
1527     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1528
1529   /* Skip big loops.  */
1530   if (nunroll <= 1)
1531     {
1532       if (dump_file)
1533         fprintf (dump_file, ";; Not considering loop, is too big\n");
1534       return;
1535     }
1536
1537   /* Check for simple loops.  */
1538   desc = get_simple_loop_desc (loop);
1539
1540   /* Check simpleness.  */
1541   if (desc->simple_p && !desc->assumptions)
1542     {
1543       if (dump_file)
1544         fprintf (dump_file, ";; The loop is simple\n");
1545       return;
1546     }
1547
1548   /* Do not unroll loops with branches inside -- it increases number
1549      of mispredicts.
1550      TODO: this heuristic needs tunning; call inside the loop body
1551      is also relatively good reason to not unroll.  */
1552   if (num_loop_branches (loop) > 1)
1553     {
1554       if (dump_file)
1555         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1556       return;
1557     }
1558
1559   /* Check whether the loop rolls.  */
1560   if ((estimated_loop_iterations (loop, &iterations)
1561        || max_loop_iterations (loop, &iterations))
1562       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1563     {
1564       if (dump_file)
1565         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1566       return;
1567     }
1568
1569   /* Success.  Now force nunroll to be power of 2, as it seems that this
1570      improves results (partially because of better alignments, partially
1571      because of some dark magic).  */
1572   for (i = 1; 2 * i <= nunroll; i *= 2)
1573     continue;
1574
1575   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1576   loop->lpt_decision.times = i - 1;
1577 }
1578
1579 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1580
1581    while (cond)
1582      body;
1583
1584    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1585
1586    while (cond)
1587      {
1588        body;
1589        if (!cond) break;
1590        body;
1591        if (!cond) break;
1592        body;
1593        if (!cond) break;
1594        body;
1595      }
1596    */
1597 static void
1598 unroll_loop_stupid (struct loop *loop)
1599 {
1600   sbitmap wont_exit;
1601   unsigned nunroll = loop->lpt_decision.times;
1602   struct niter_desc *desc = get_simple_loop_desc (loop);
1603   struct opt_info *opt_info = NULL;
1604   bool ok;
1605
1606   if (flag_split_ivs_in_unroller
1607       || flag_variable_expansion_in_unroller)
1608     opt_info = analyze_insns_in_loop (loop);
1609
1610
1611   wont_exit = sbitmap_alloc (nunroll + 1);
1612   bitmap_clear (wont_exit);
1613   opt_info_start_duplication (opt_info);
1614
1615   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1616                                       nunroll, wont_exit,
1617                                       NULL, NULL,
1618                                       DLTHE_FLAG_UPDATE_FREQ
1619                                       | (opt_info
1620                                          ? DLTHE_RECORD_COPY_NUMBER
1621                                            : 0));
1622   gcc_assert (ok);
1623
1624   if (opt_info)
1625     {
1626       apply_opt_in_copies (opt_info, nunroll, true, true);
1627       free_opt_info (opt_info);
1628     }
1629
1630   free (wont_exit);
1631
1632   if (desc->simple_p)
1633     {
1634       /* We indeed may get here provided that there are nontrivial assumptions
1635          for a loop to be really simple.  We could update the counts, but the
1636          problem is that we are unable to decide which exit will be taken
1637          (not really true in case the number of iterations is constant,
1638          but no one will do anything with this information, so we do not
1639          worry about it).  */
1640       desc->simple_p = false;
1641     }
1642
1643   if (dump_file)
1644     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1645              nunroll, num_loop_insns (loop));
1646 }
1647
1648 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1649    Set *DEBUG_USES to the number of debug insns that reference the
1650    variable.  */
1651
1652 bool
1653 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1654                                   int *debug_uses)
1655 {
1656   basic_block *body, bb;
1657   unsigned i;
1658   int count_ref = 0;
1659   rtx insn;
1660
1661   body = get_loop_body (loop);
1662   for (i = 0; i < loop->num_nodes; i++)
1663     {
1664       bb = body[i];
1665
1666       FOR_BB_INSNS (bb, insn)
1667         if (!rtx_referenced_p (reg, insn))
1668           continue;
1669         else if (DEBUG_INSN_P (insn))
1670           ++*debug_uses;
1671         else if (++count_ref > 1)
1672           break;
1673     }
1674   free (body);
1675   return (count_ref  == 1);
1676 }
1677
1678 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1679
1680 static void
1681 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1682 {
1683   basic_block *body, bb;
1684   unsigned i;
1685   rtx insn;
1686
1687   body = get_loop_body (loop);
1688   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1689     {
1690       bb = body[i];
1691
1692       FOR_BB_INSNS (bb, insn)
1693         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1694           continue;
1695         else
1696           {
1697             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1698                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1699             if (!--debug_uses)
1700               break;
1701           }
1702     }
1703   free (body);
1704 }
1705
1706 /* Determine whether INSN contains an accumulator
1707    which can be expanded into separate copies,
1708    one for each copy of the LOOP body.
1709
1710    for (i = 0 ; i < n; i++)
1711      sum += a[i];
1712
1713    ==>
1714
1715    sum += a[i]
1716    ....
1717    i = i+1;
1718    sum1 += a[i]
1719    ....
1720    i = i+1
1721    sum2 += a[i];
1722    ....
1723
1724    Return NULL if INSN contains no opportunity for expansion of accumulator.
1725    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1726    information and return a pointer to it.
1727 */
1728
1729 static struct var_to_expand *
1730 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1731 {
1732   rtx set, dest, src;
1733   struct var_to_expand *ves;
1734   unsigned accum_pos;
1735   enum rtx_code code;
1736   int debug_uses = 0;
1737
1738   set = single_set (insn);
1739   if (!set)
1740     return NULL;
1741
1742   dest = SET_DEST (set);
1743   src = SET_SRC (set);
1744   code = GET_CODE (src);
1745
1746   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1747     return NULL;
1748
1749   if (FLOAT_MODE_P (GET_MODE (dest)))
1750     {
1751       if (!flag_associative_math)
1752         return NULL;
1753       /* In the case of FMA, we're also changing the rounding.  */
1754       if (code == FMA && !flag_unsafe_math_optimizations)
1755         return NULL;
1756     }
1757
1758   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1759      in MD.  But if there is no optab to generate the insn, we can not
1760      perform the variable expansion.  This can happen if an MD provides
1761      an insn but not a named pattern to generate it, for example to avoid
1762      producing code that needs additional mode switches like for x87/mmx.
1763
1764      So we check have_insn_for which looks for an optab for the operation
1765      in SRC.  If it doesn't exist, we can't perform the expansion even
1766      though INSN is valid.  */
1767   if (!have_insn_for (code, GET_MODE (src)))
1768     return NULL;
1769
1770   if (!REG_P (dest)
1771       && !(GET_CODE (dest) == SUBREG
1772            && REG_P (SUBREG_REG (dest))))
1773     return NULL;
1774
1775   /* Find the accumulator use within the operation.  */
1776   if (code == FMA)
1777     {
1778       /* We only support accumulation via FMA in the ADD position.  */
1779       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1780         return NULL;
1781       accum_pos = 2;
1782     }
1783   else if (rtx_equal_p (dest, XEXP (src, 0)))
1784     accum_pos = 0;
1785   else if (rtx_equal_p (dest, XEXP (src, 1)))
1786     {
1787       /* The method of expansion that we are using; which includes the
1788          initialization of the expansions with zero and the summation of
1789          the expansions at the end of the computation will yield wrong
1790          results for (x = something - x) thus avoid using it in that case.  */
1791       if (code == MINUS)
1792         return NULL;
1793       accum_pos = 1;
1794     }
1795   else
1796     return NULL;
1797
1798   /* It must not otherwise be used.  */
1799   if (code == FMA)
1800     {
1801       if (rtx_referenced_p (dest, XEXP (src, 0))
1802           || rtx_referenced_p (dest, XEXP (src, 1)))
1803         return NULL;
1804     }
1805   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1806     return NULL;
1807
1808   /* It must be used in exactly one insn.  */
1809   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1810     return NULL;
1811
1812   if (dump_file)
1813     {
1814       fprintf (dump_file, "\n;; Expanding Accumulator ");
1815       print_rtl (dump_file, dest);
1816       fprintf (dump_file, "\n");
1817     }
1818
1819   if (debug_uses)
1820     /* Instead of resetting the debug insns, we could replace each
1821        debug use in the loop with the sum or product of all expanded
1822        accummulators.  Since we'll only know of all expansions at the
1823        end, we'd have to keep track of which vars_to_expand a debug
1824        insn in the loop references, take note of each copy of the
1825        debug insn during unrolling, and when it's all done, compute
1826        the sum or product of each variable and adjust the original
1827        debug insn and each copy thereof.  What a pain!  */
1828     reset_debug_uses_in_loop (loop, dest, debug_uses);
1829
1830   /* Record the accumulator to expand.  */
1831   ves = XNEW (struct var_to_expand);
1832   ves->insn = insn;
1833   ves->reg = copy_rtx (dest);
1834   ves->var_expansions.create (1);
1835   ves->next = NULL;
1836   ves->op = GET_CODE (src);
1837   ves->expansion_count = 0;
1838   ves->reuse_expansion = 0;
1839   return ves;
1840 }
1841
1842 /* Determine whether there is an induction variable in INSN that
1843    we would like to split during unrolling.
1844
1845    I.e. replace
1846
1847    i = i + 1;
1848    ...
1849    i = i + 1;
1850    ...
1851    i = i + 1;
1852    ...
1853
1854    type chains by
1855
1856    i0 = i + 1
1857    ...
1858    i = i0 + 1
1859    ...
1860    i = i0 + 2
1861    ...
1862
1863    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1864    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1865    pointer to it.  */
1866
1867 static struct iv_to_split *
1868 analyze_iv_to_split_insn (rtx insn)
1869 {
1870   rtx set, dest;
1871   struct rtx_iv iv;
1872   struct iv_to_split *ivts;
1873   bool ok;
1874
1875   /* For now we just split the basic induction variables.  Later this may be
1876      extended for example by selecting also addresses of memory references.  */
1877   set = single_set (insn);
1878   if (!set)
1879     return NULL;
1880
1881   dest = SET_DEST (set);
1882   if (!REG_P (dest))
1883     return NULL;
1884
1885   if (!biv_p (insn, dest))
1886     return NULL;
1887
1888   ok = iv_analyze_result (insn, dest, &iv);
1889
1890   /* This used to be an assert under the assumption that if biv_p returns
1891      true that iv_analyze_result must also return true.  However, that
1892      assumption is not strictly correct as evidenced by pr25569.
1893
1894      Returning NULL when iv_analyze_result returns false is safe and
1895      avoids the problems in pr25569 until the iv_analyze_* routines
1896      can be fixed, which is apparently hard and time consuming
1897      according to their author.  */
1898   if (! ok)
1899     return NULL;
1900
1901   if (iv.step == const0_rtx
1902       || iv.mode != iv.extend_mode)
1903     return NULL;
1904
1905   /* Record the insn to split.  */
1906   ivts = XNEW (struct iv_to_split);
1907   ivts->insn = insn;
1908   ivts->orig_var = dest;
1909   ivts->base_var = NULL_RTX;
1910   ivts->step = iv.step;
1911   ivts->next = NULL;
1912   ivts->n_loc = 1;
1913   ivts->loc[0] = 1;
1914
1915   return ivts;
1916 }
1917
1918 /* Determines which of insns in LOOP can be optimized.
1919    Return a OPT_INFO struct with the relevant hash tables filled
1920    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1921    is undefined for the return value.  */
1922
1923 static struct opt_info *
1924 analyze_insns_in_loop (struct loop *loop)
1925 {
1926   basic_block *body, bb;
1927   unsigned i;
1928   struct opt_info *opt_info = XCNEW (struct opt_info);
1929   rtx insn;
1930   struct iv_to_split *ivts = NULL;
1931   struct var_to_expand *ves = NULL;
1932   iv_to_split **slot1;
1933   var_to_expand **slot2;
1934   vec<edge> edges = get_loop_exit_edges (loop);
1935   edge exit;
1936   bool can_apply = false;
1937
1938   iv_analysis_loop_init (loop);
1939
1940   body = get_loop_body (loop);
1941
1942   if (flag_split_ivs_in_unroller)
1943     {
1944       opt_info->insns_to_split.create (5 * loop->num_nodes);
1945       opt_info->iv_to_split_head = NULL;
1946       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1947     }
1948
1949   /* Record the loop exit bb and loop preheader before the unrolling.  */
1950   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1951
1952   if (edges.length () == 1)
1953     {
1954       exit = edges[0];
1955       if (!(exit->flags & EDGE_COMPLEX))
1956         {
1957           opt_info->loop_exit = split_edge (exit);
1958           can_apply = true;
1959         }
1960     }
1961
1962   if (flag_variable_expansion_in_unroller
1963       && can_apply)
1964     {
1965       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1966       opt_info->var_to_expand_head = NULL;
1967       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1968     }
1969
1970   for (i = 0; i < loop->num_nodes; i++)
1971     {
1972       bb = body[i];
1973       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1974         continue;
1975
1976       FOR_BB_INSNS (bb, insn)
1977       {
1978         if (!INSN_P (insn))
1979           continue;
1980
1981         if (opt_info->insns_to_split.is_created ())
1982           ivts = analyze_iv_to_split_insn (insn);
1983
1984         if (ivts)
1985           {
1986             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
1987             gcc_assert (*slot1 == NULL);
1988             *slot1 = ivts;
1989             *opt_info->iv_to_split_tail = ivts;
1990             opt_info->iv_to_split_tail = &ivts->next;
1991             continue;
1992           }
1993
1994         if (opt_info->insns_with_var_to_expand.is_created ())
1995           ves = analyze_insn_to_expand_var (loop, insn);
1996
1997         if (ves)
1998           {
1999             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
2000             gcc_assert (*slot2 == NULL);
2001             *slot2 = ves;
2002             *opt_info->var_to_expand_tail = ves;
2003             opt_info->var_to_expand_tail = &ves->next;
2004           }
2005       }
2006     }
2007
2008   edges.release ();
2009   free (body);
2010   return opt_info;
2011 }
2012
2013 /* Called just before loop duplication.  Records start of duplicated area
2014    to OPT_INFO.  */
2015
2016 static void
2017 opt_info_start_duplication (struct opt_info *opt_info)
2018 {
2019   if (opt_info)
2020     opt_info->first_new_block = last_basic_block;
2021 }
2022
/* Return the number of iterations between the initialization of the
   base variable and the copy number N_COPY.  N_COPIES is the total
   number of newly created copies.  UNROLLING is true if we are
   unrolling (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization is done in the original loop
     body (number 0), so the copy number is the distance.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization occurs in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
2047
2048 /* Locate in EXPR the expression corresponding to the location recorded
2049    in IVTS, and return a pointer to the RTX for this location.  */
2050
2051 static rtx *
2052 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2053 {
2054   unsigned i;
2055   rtx *ret = &expr;
2056
2057   for (i = 0; i < ivts->n_loc; i++)
2058     ret = &XEXP (*ret, ivts->loc[i]);
2059
2060   return ret;
2061 }
2062
2063 /* Allocate basic variable for the induction variable chain.  */
2064
2065 static void
2066 allocate_basic_variable (struct iv_to_split *ivts)
2067 {
2068   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2069
2070   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2071 }
2072
2073 /* Insert initialization of basic variable of IVTS before INSN, taking
2074    the initial value from INSN.  */
2075
2076 static void
2077 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2078 {
2079   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2080   rtx seq;
2081
2082   start_sequence ();
2083   expr = force_operand (expr, ivts->base_var);
2084   if (expr != ivts->base_var)
2085     emit_move_insn (ivts->base_var, expr);
2086   seq = get_insns ();
2087   end_sequence ();
2088
2089   emit_insn_before (seq, insn);
2090 }
2091
2092 /* Replace the use of induction variable described in IVTS in INSN
2093    by base variable + DELTA * step.  */
2094
2095 static void
2096 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2097 {
2098   rtx expr, *loc, seq, incr, var;
2099   enum machine_mode mode = GET_MODE (ivts->base_var);
2100   rtx src, dest, set;
2101
2102   /* Construct base + DELTA * step.  */
2103   if (!delta)
2104     expr = ivts->base_var;
2105   else
2106     {
2107       incr = simplify_gen_binary (MULT, mode,
2108                                   ivts->step, gen_int_mode (delta, mode));
2109       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2110                                   ivts->base_var, incr);
2111     }
2112
2113   /* Figure out where to do the replacement.  */
2114   loc = get_ivts_expr (single_set (insn), ivts);
2115
2116   /* If we can make the replacement right away, we're done.  */
2117   if (validate_change (insn, loc, expr, 0))
2118     return;
2119
2120   /* Otherwise, force EXPR into a register and try again.  */
2121   start_sequence ();
2122   var = gen_reg_rtx (mode);
2123   expr = force_operand (expr, var);
2124   if (expr != var)
2125     emit_move_insn (var, expr);
2126   seq = get_insns ();
2127   end_sequence ();
2128   emit_insn_before (seq, insn);
2129
2130   if (validate_change (insn, loc, var, 0))
2131     return;
2132
2133   /* The last chance.  Try recreating the assignment in insn
2134      completely from scratch.  */
2135   set = single_set (insn);
2136   gcc_assert (set);
2137
2138   start_sequence ();
2139   *loc = var;
2140   src = copy_rtx (SET_SRC (set));
2141   dest = copy_rtx (SET_DEST (set));
2142   src = force_operand (src, dest);
2143   if (src != dest)
2144     emit_move_insn (dest, src);
2145   seq = get_insns ();
2146   end_sequence ();
2147
2148   emit_insn_before (seq, insn);
2149   delete_insn (insn);
2150 }
2151
2152
2153 /* Return one expansion of the accumulator recorded in struct VE.  */
2154
2155 static rtx
2156 get_expansion (struct var_to_expand *ve)
2157 {
2158   rtx reg;
2159
2160   if (ve->reuse_expansion == 0)
2161     reg = ve->reg;
2162   else
2163     reg = ve->var_expansions[ve->reuse_expansion - 1];
2164
2165   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2166     ve->reuse_expansion = 0;
2167   else
2168     ve->reuse_expansion++;
2169
2170   return reg;
2171 }
2172
2173
2174 /* Given INSN replace the uses of the accumulator recorded in VE
2175    with a new register.  */
2176
2177 static void
2178 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2179 {
2180   rtx new_reg, set;
2181   bool really_new_expansion = false;
2182
2183   set = single_set (insn);
2184   gcc_assert (set);
2185
2186   /* Generate a new register only if the expansion limit has not been
2187      reached.  Else reuse an already existing expansion.  */
2188   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2189     {
2190       really_new_expansion = true;
2191       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2192     }
2193   else
2194     new_reg = get_expansion (ve);
2195
2196   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2197   if (apply_change_group ())
2198     if (really_new_expansion)
2199       {
2200         ve->var_expansions.safe_push (new_reg);
2201         ve->expansion_count++;
2202       }
2203 }
2204
2205 /* Initialize the variable expansions in loop preheader.  PLACE is the
2206    loop-preheader basic block where the initialization of the
2207    expansions should take place.  The expansions are initialized with
2208    (-0) when the operation is plus or minus to honor sign zero.  This
2209    way we can prevent cases where the sign of the final result is
2210    effected by the sign of the expansion.  Here is an example to
2211    demonstrate this:
2212
2213    for (i = 0 ; i < n; i++)
2214      sum += something;
2215
2216    ==>
2217
2218    sum += something
2219    ....
2220    i = i+1;
2221    sum1 += something
2222    ....
2223    i = i+1
2224    sum2 += something;
2225    ....
2226
2227    When SUM is initialized with -zero and SOMETHING is also -zero; the
2228    final result of sum should be -zero thus the expansions sum1 and sum2
2229    should be initialized with -zero as well (otherwise we will get +zero
2230    as the final result).  */
2231
2232 static void
2233 insert_var_expansion_initialization (struct var_to_expand *ve,
2234                                      basic_block place)
2235 {
2236   rtx seq, var, zero_init;
2237   unsigned i;
2238   enum machine_mode mode = GET_MODE (ve->reg);
2239   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2240
2241   if (ve->var_expansions.length () == 0)
2242     return;
2243
2244   start_sequence ();
2245   switch (ve->op)
2246     {
2247     case FMA:
2248       /* Note that we only accumulate FMA via the ADD operand.  */
2249     case PLUS:
2250     case MINUS:
2251       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2252         {
2253           if (honor_signed_zero_p)
2254             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2255           else
2256             zero_init = CONST0_RTX (mode);
2257           emit_move_insn (var, zero_init);
2258         }
2259       break;
2260
2261     case MULT:
2262       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2263         {
2264           zero_init = CONST1_RTX (GET_MODE (var));
2265           emit_move_insn (var, zero_init);
2266         }
2267       break;
2268
2269     default:
2270       gcc_unreachable ();
2271     }
2272
2273   seq = get_insns ();
2274   end_sequence ();
2275
2276   emit_insn_after (seq, BB_END (place));
2277 }
2278
2279 /* Combine the variable expansions at the loop exit.  PLACE is the
2280    loop exit basic block where the summation of the expansions should
2281    take place.  */
2282
2283 static void
2284 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2285 {
2286   rtx sum = ve->reg;
2287   rtx expr, seq, var, insn;
2288   unsigned i;
2289
2290   if (ve->var_expansions.length () == 0)
2291     return;
2292
2293   start_sequence ();
2294   switch (ve->op)
2295     {
2296     case FMA:
2297       /* Note that we only accumulate FMA via the ADD operand.  */
2298     case PLUS:
2299     case MINUS:
2300       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2301         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2302       break;
2303
2304     case MULT:
2305       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2306         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2307       break;
2308
2309     default:
2310       gcc_unreachable ();
2311     }
2312
2313   expr = force_operand (sum, ve->reg);
2314   if (expr != ve->reg)
2315     emit_move_insn (ve->reg, expr);
2316   seq = get_insns ();
2317   end_sequence ();
2318
2319   insn = BB_HEAD (place);
2320   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2321     insn = NEXT_INSN (insn);
2322
2323   emit_insn_after (seq, insn);
2324 }
2325
2326 /* Strip away REG_EQUAL notes for IVs we're splitting.
2327
2328    Updating REG_EQUAL notes for IVs we split is tricky: We
2329    cannot tell until after unrolling, DF-rescanning, and liveness
2330    updating, whether an EQ_USE is reached by the split IV while
2331    the IV reg is still live.  See PR55006.
2332
2333    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2334    because RTL loop-iv requires us to defer rescanning insns and
2335    any notes attached to them.  So resort to old techniques...  */
2336
2337 static void
2338 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2339 {
2340   struct iv_to_split *ivts;
2341   rtx note = find_reg_equal_equiv_note (insn);
2342   if (! note)
2343     return;
2344   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2345     if (reg_mentioned_p (ivts->orig_var, note))
2346       {
2347         remove_note (insn, note);
2348         return;
2349       }
2350 }
2351
2352 /* Apply loop optimizations in loop copies using the
2353    data which gathered during the unrolling.  Structure
2354    OPT_INFO record that data.
2355
2356    UNROLLING is true if we unrolled (not peeled) the loop.
2357    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2358    the loop (as it should happen in complete unrolling, but not in ordinary
2359    peeling of the loop).  */
2360
2361 static void
2362 apply_opt_in_copies (struct opt_info *opt_info,
2363                      unsigned n_copies, bool unrolling,
2364                      bool rewrite_original_loop)
2365 {
2366   unsigned i, delta;
2367   basic_block bb, orig_bb;
2368   rtx insn, orig_insn, next;
2369   struct iv_to_split ivts_templ, *ivts;
2370   struct var_to_expand ve_templ, *ves;
2371
2372   /* Sanity check -- we need to put initialization in the original loop
2373      body.  */
2374   gcc_assert (!unrolling || rewrite_original_loop);
2375
2376   /* Allocate the basic variables (i0).  */
2377   if (opt_info->insns_to_split.is_created ())
2378     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2379       allocate_basic_variable (ivts);
2380
2381   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2382     {
2383       bb = BASIC_BLOCK (i);
2384       orig_bb = get_bb_original (bb);
2385
2386       /* bb->aux holds position in copy sequence initialized by
2387          duplicate_loop_to_header_edge.  */
2388       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2389                                         unrolling);
2390       bb->aux = 0;
2391       orig_insn = BB_HEAD (orig_bb);
2392       FOR_BB_INSNS_SAFE (bb, insn, next)
2393         {
2394           if (!INSN_P (insn)
2395               || (DEBUG_INSN_P (insn)
2396                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2397             continue;
2398
2399           while (!INSN_P (orig_insn)
2400                  || (DEBUG_INSN_P (orig_insn)
2401                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2402                          == LABEL_DECL)))
2403             orig_insn = NEXT_INSN (orig_insn);
2404
2405           ivts_templ.insn = orig_insn;
2406           ve_templ.insn = orig_insn;
2407
2408           /* Apply splitting iv optimization.  */
2409           if (opt_info->insns_to_split.is_created ())
2410             {
2411               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2412
2413               ivts = opt_info->insns_to_split.find (&ivts_templ);
2414
2415               if (ivts)
2416                 {
2417                   gcc_assert (GET_CODE (PATTERN (insn))
2418                               == GET_CODE (PATTERN (orig_insn)));
2419
2420                   if (!delta)
2421                     insert_base_initialization (ivts, insn);
2422                   split_iv (ivts, insn, delta);
2423                 }
2424             }
2425           /* Apply variable expansion optimization.  */
2426           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2427             {
2428               ves = (struct var_to_expand *)
2429                 opt_info->insns_with_var_to_expand.find (&ve_templ);
2430               if (ves)
2431                 {
2432                   gcc_assert (GET_CODE (PATTERN (insn))
2433                               == GET_CODE (PATTERN (orig_insn)));
2434                   expand_var_during_unrolling (ves, insn);
2435                 }
2436             }
2437           orig_insn = NEXT_INSN (orig_insn);
2438         }
2439     }
2440
2441   if (!rewrite_original_loop)
2442     return;
2443
2444   /* Initialize the variable expansions in the loop preheader
2445      and take care of combining them at the loop exit.  */
2446   if (opt_info->insns_with_var_to_expand.is_created ())
2447     {
2448       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2449         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2450       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2451         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2452     }
2453
2454   /* Rewrite also the original loop body.  Find them as originals of the blocks
2455      in the last copied iteration, i.e. those that have
2456      get_bb_copy (get_bb_original (bb)) == bb.  */
2457   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2458     {
2459       bb = BASIC_BLOCK (i);
2460       orig_bb = get_bb_original (bb);
2461       if (get_bb_copy (orig_bb) != bb)
2462         continue;
2463
2464       delta = determine_split_iv_delta (0, n_copies, unrolling);
2465       for (orig_insn = BB_HEAD (orig_bb);
2466            orig_insn != NEXT_INSN (BB_END (bb));
2467            orig_insn = next)
2468         {
2469           next = NEXT_INSN (orig_insn);
2470
2471           if (!INSN_P (orig_insn))
2472             continue;
2473
2474           ivts_templ.insn = orig_insn;
2475           if (opt_info->insns_to_split.is_created ())
2476             {
2477               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2478
2479               ivts = (struct iv_to_split *)
2480                 opt_info->insns_to_split.find (&ivts_templ);
2481               if (ivts)
2482                 {
2483                   if (!delta)
2484                     insert_base_initialization (ivts, orig_insn);
2485                   split_iv (ivts, orig_insn, delta);
2486                   continue;
2487                 }
2488             }
2489
2490         }
2491     }
2492 }
2493
2494 /* Release OPT_INFO.  */
2495
2496 static void
2497 free_opt_info (struct opt_info *opt_info)
2498 {
2499   if (opt_info->insns_to_split.is_created ())
2500     opt_info->insns_to_split.dispose ();
2501   if (opt_info->insns_with_var_to_expand.is_created ())
2502     {
2503       struct var_to_expand *ves;
2504
2505       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2506         ves->var_expansions.release ();
2507       opt_info->insns_with_var_to_expand.dispose ();
2508     }
2509   free (opt_info);
2510 }