gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "hard-reg-set.h"
  26 #include "obstack.h"
  27 #include "basic-block.h"
  28 #include "cfgloop.h"
  29 #include "params.h"
  30 #include "expr.h"
  31 #include "hash-table.h"
  32 #include "recog.h"
  33 #include "target.h"
  34 #include "dumpfile.h"
  35
  36 /* This pass performs loop unrolling and peeling.  We only perform these
  37    optimizations on innermost loops (with single exception) because
  38    the impact on performance is greatest here, and we want to avoid
  39    unnecessary code size growth.  The gain is caused by greater sequentiality
  40    of code, better code to optimize for further passes and in some cases
  41    by fewer testings of exit conditions.  The main problem is code growth,
  42    that impacts performance negatively due to effect of caches.
  43
  44    What we do:
  45
  46    -- complete peeling of once-rolling loops; this is the above mentioned
  47       exception, as this causes loop to be cancelled completely and
  48       does not cause code growth
  49    -- complete peeling of loops that roll (small) constant times.
  50    -- simple peeling of first iterations of loops that do not roll much
  51       (according to profile feedback)
  52    -- unrolling of loops that roll constant times; this is almost always
  53       win, as we get rid of exit condition tests.
  54    -- unrolling of loops that roll number of times that we can compute
  55       in runtime; we also get rid of exit condition tests here, but there
  56       is the extra expense for calculating the number of iterations
  57    -- simple unrolling of remaining loops; this is performed only if we
  58       are asked to, as the gain is questionable in this case and often
  59       it may even slow down the code
  60    For more detailed descriptions of each of those, see comments at
  61    appropriate function below.
  62
  63    There are many parameters (defined and described in params.def) that
  64    control how much we unroll/peel.
  65
  66    ??? A great problem is that we don't have a good way to determine
  67    how many times we should unroll the loop; the experiments I have made
  68    showed that this choice may affect performance by several percent.
  69    */
  70
  71 /* Information about induction variables to split.  */
  72
  73 struct iv_to_split
  74 {
  75   rtx insn;             /* The insn in that the induction variable occurs.  */
  76   rtx orig_var;         /* The variable (register) for the IV before split.  */
  77   rtx base_var;         /* The variable on that the values in the further
  78                            iterations are based.  */
  79   rtx step;             /* Step of the induction variable.  */
  80   struct iv_to_split *next; /* Next entry in walking order.  */
  81   unsigned n_loc;
  82   unsigned loc[3];      /* Location where the definition of the induction
  83                            variable occurs in the insn.  For example if
  84                            N_LOC is 2, the expression is located at
  85                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  86 };
  87
  88 /* Information about accumulators to expand.  */
  89
  90 struct var_to_expand
  91 {
  92   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  93   rtx reg;                         /* The accumulator which is expanded.  */
  94   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  95   struct var_to_expand *next;      /* Next entry in walking order.  */
  96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  97                                       or multiplication.  */
  98   int expansion_count;             /* Count the number of expansions generated so far.  */
  99   int reuse_expansion;             /* The expansion we intend to reuse to expand
 100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 101                                       the original accumulator.  Else use
 102                                       var_expansions[REUSE_EXPANSION - 1].  */
 103 };
 104
 105 /* Hashtable helper for iv_to_split.  */
 106
 107 struct iv_split_hasher : typed_free_remove <iv_to_split>
 108 {
 109   typedef iv_to_split value_type;
 110   typedef iv_to_split compare_type;
 111   static inline hashval_t hash (const value_type *);
 112   static inline bool equal (const value_type *, const compare_type *);
 113 };
 114
 115
 116 /* A hash function for information about insns to split.  */
 117
 118 inline hashval_t
 119 iv_split_hasher::hash (const value_type *ivts)
 120 {
 121   return (hashval_t) INSN_UID (ivts->insn);
 122 }
 123
 124 /* An equality functions for information about insns to split.  */
 125
 126 inline bool
 127 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
 128 {
 129   return i1->insn == i2->insn;
 130 }
 131
 132 /* Hashtable helper for iv_to_split.  */
 133
 134 struct var_expand_hasher : typed_free_remove <var_to_expand>
 135 {
 136   typedef var_to_expand value_type;
 137   typedef var_to_expand compare_type;
 138   static inline hashval_t hash (const value_type *);
 139   static inline bool equal (const value_type *, const compare_type *);
 140 };
 141
 142 /* Return a hash for VES.  */
 143
 144 inline hashval_t
 145 var_expand_hasher::hash (const value_type *ves)
 146 {
 147   return (hashval_t) INSN_UID (ves->insn);
 148 }
 149
 150 /* Return true if I1 and I2 refer to the same instruction.  */
 151
 152 inline bool
 153 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
 154 {
 155   return i1->insn == i2->insn;
 156 }
 157
 158 /* Information about optimization applied in
 159    the unrolled loop.  */
 160
 161 struct opt_info
 162 {
 163   hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
 164                                                   split.  */
 165   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 166   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 167   hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
 168                                         insns with accumulators to expand.  */
 169   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 170   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 171   unsigned first_new_block;        /* The first basic block that was
 172                                       duplicated.  */
 173   basic_block loop_exit;           /* The loop exit basic block.  */
 174   basic_block loop_preheader;      /* The loop preheader basic block.  */
 175 };
 176
 177 static void decide_unrolling_and_peeling (int);
 178 static void peel_loops_completely (int);
 179 static void decide_peel_simple (struct loop *, int);
 180 static void decide_peel_once_rolling (struct loop *, int);
 181 static void decide_peel_completely (struct loop *, int);
 182 static void decide_unroll_stupid (struct loop *, int);
 183 static void decide_unroll_constant_iterations (struct loop *, int);
 184 static void decide_unroll_runtime_iterations (struct loop *, int);
 185 static void peel_loop_simple (struct loop *);
 186 static void peel_loop_completely (struct loop *);
 187 static void unroll_loop_stupid (struct loop *);
 188 static void unroll_loop_constant_iterations (struct loop *);
 189 static void unroll_loop_runtime_iterations (struct loop *);
 190 static struct opt_info *analyze_insns_in_loop (struct loop *);
 191 static void opt_info_start_duplication (struct opt_info *);
 192 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 193 static void free_opt_info (struct opt_info *);
 194 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 195 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 196 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 197 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 198 static void insert_var_expansion_initialization (struct var_to_expand *,
 199                                                  basic_block);
 200 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 201                                              basic_block);
 202 static rtx get_expansion (struct var_to_expand *);
 203
 204 /* Emit a message summarizing the unroll or peel that will be
 205    performed for LOOP, along with the loop's location LOCUS, if
 206    appropriate given the dump or -fopt-info settings.  */
 207
 208 static void
 209 report_unroll_peel (struct loop *loop, location_t locus)
 210 {
 211   struct niter_desc *desc;
 212   int niters = 0;
 213   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 214
 215   if (loop->lpt_decision.decision == LPT_NONE)
 216     return;
 217
 218   if (!dump_enabled_p ())
 219     return;
 220
 221   /* In the special case where the loop never iterated, emit
 222      a different message so that we don't report an unroll by 0.
 223      This matches the equivalent message emitted during tree unrolling.  */
 224   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 225       && !loop->lpt_decision.times)
 226     {
 227       dump_printf_loc (report_flags, locus,
 228                        "loop turned into non-loop; it never loops.\n");
 229       return;
 230     }
 231
 232   desc = get_simple_loop_desc (loop);
 233
 234   if (desc->const_iter)
 235     niters = desc->niter;
 236   else if (loop->header->count)
 237     niters = expected_loop_iterations (loop);
 238
 239   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 240     dump_printf_loc (report_flags, locus,
 241                      "loop with %d iterations completely unrolled",
 242                      loop->lpt_decision.times + 1);
 243   else
 244     dump_printf_loc (report_flags, locus,
 245                      "loop %s %d times",
 246                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 247                        ? "peeled" : "unrolled"),
 248                      loop->lpt_decision.times);
 249   if (profile_info)
 250     dump_printf (report_flags,
 251                  " (header execution count %d",
 252                  (int)loop->header->count);
 253   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 254     dump_printf (report_flags,
 255                  "%s%s iterations %d)",
 256                  profile_info ? ", " : " (",
 257                  desc->const_iter ? "const" : "average",
 258                  niters);
 259   else if (profile_info)
 260     dump_printf (report_flags, ")");
 261
 262   dump_printf (report_flags, "\n");
 263 }
 264
 265 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 266 void
 267 unroll_and_peel_loops (int flags)
 268 {
 269   struct loop *loop;
 270   bool changed = false;
 271   loop_iterator li;
 272
 273   /* First perform complete loop peeling (it is almost surely a win,
 274      and affects parameters for further decision a lot).  */
 275   peel_loops_completely (flags);
 276
 277   /* Now decide rest of unrolling and peeling.  */
 278   decide_unrolling_and_peeling (flags);
 279
 280   /* Scan the loops, inner ones first.  */
 281   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 282     {
 283       /* And perform the appropriate transformations.  */
 284       switch (loop->lpt_decision.decision)
 285         {
 286         case LPT_PEEL_COMPLETELY:
 287           /* Already done.  */
 288           gcc_unreachable ();
 289         case LPT_PEEL_SIMPLE:
 290           peel_loop_simple (loop);
 291           changed = true;
 292           break;
 293         case LPT_UNROLL_CONSTANT:
 294           unroll_loop_constant_iterations (loop);
 295           changed = true;
 296           break;
 297         case LPT_UNROLL_RUNTIME:
 298           unroll_loop_runtime_iterations (loop);
 299           changed = true;
 300           break;
 301         case LPT_UNROLL_STUPID:
 302           unroll_loop_stupid (loop);
 303           changed = true;
 304           break;
 305         case LPT_NONE:
 306           break;
 307         default:
 308           gcc_unreachable ();
 309         }
 310     }
 311
 312     if (changed)
 313       {
 314         calculate_dominance_info (CDI_DOMINATORS);
 315         fix_loop_structure (NULL);
 316       }
 317
 318   iv_analysis_done ();
 319 }
 320
 321 /* Check whether exit of the LOOP is at the end of loop body.  */
 322
 323 static bool
 324 loop_exit_at_end_p (struct loop *loop)
 325 {
 326   struct niter_desc *desc = get_simple_loop_desc (loop);
 327   rtx insn;
 328
 329   if (desc->in_edge->dest != loop->latch)
 330     return false;
 331
 332   /* Check that the latch is empty.  */
 333   FOR_BB_INSNS (loop->latch, insn)
 334     {
 335       if (NONDEBUG_INSN_P (insn))
 336         return false;
 337     }
 338
 339   return true;
 340 }
 341
 342 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 343 static void
 344 peel_loops_completely (int flags)
 345 {
 346   struct loop *loop;
 347   loop_iterator li;
 348   bool changed = false;
 349
 350   /* Scan the loops, the inner ones first.  */
 351   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 352     {
 353       loop->lpt_decision.decision = LPT_NONE;
 354       location_t locus = get_loop_location (loop);
 355
 356       if (dump_enabled_p ())
 357         dump_printf_loc (TDF_RTL, locus,
 358                          ";; *** Considering loop %d at BB %d for "
 359                          "complete peeling ***\n",
 360                          loop->num, loop->header->index);
 361
 362       loop->ninsns = num_loop_insns (loop);
 363
 364       decide_peel_once_rolling (loop, flags);
 365       if (loop->lpt_decision.decision == LPT_NONE)
 366         decide_peel_completely (loop, flags);
 367
 368       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 369         {
 370           report_unroll_peel (loop, locus);
 371           peel_loop_completely (loop);
 372           changed = true;
 373         }
 374     }
 375
 376     if (changed)
 377       {
 378         calculate_dominance_info (CDI_DOMINATORS);
 379         fix_loop_structure (NULL);
 380       }
 381 }
 382
 383 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 384 static void
 385 decide_unrolling_and_peeling (int flags)
 386 {
 387   struct loop *loop;
 388   loop_iterator li;
 389
 390   /* Scan the loops, inner ones first.  */
 391   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 392     {
 393       loop->lpt_decision.decision = LPT_NONE;
 394       location_t locus = get_loop_location (loop);
 395
 396       if (dump_enabled_p ())
 397         dump_printf_loc (TDF_RTL, locus,
 398                          ";; *** Considering loop %d at BB %d for "
 399                          "unrolling and peeling ***\n",
 400                          loop->num, loop->header->index);
 401
 402       /* Do not peel cold areas.  */
 403       if (optimize_loop_for_size_p (loop))
 404         {
 405           if (dump_file)
 406             fprintf (dump_file, ";; Not considering loop, cold area\n");
 407           continue;
 408         }
 409
 410       /* Can the loop be manipulated?  */
 411       if (!can_duplicate_loop_p (loop))
 412         {
 413           if (dump_file)
 414             fprintf (dump_file,
 415                      ";; Not considering loop, cannot duplicate\n");
 416           continue;
 417         }
 418
 419       /* Skip non-innermost loops.  */
 420       if (loop->inner)
 421         {
 422           if (dump_file)
 423             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 424           continue;
 425         }
 426
 427       loop->ninsns = num_loop_insns (loop);
 428       loop->av_ninsns = average_num_loop_insns (loop);
 429
 430       /* Try transformations one by one in decreasing order of
 431          priority.  */
 432
 433       decide_unroll_constant_iterations (loop, flags);
 434       if (loop->lpt_decision.decision == LPT_NONE)
 435         decide_unroll_runtime_iterations (loop, flags);
 436       if (loop->lpt_decision.decision == LPT_NONE)
 437         decide_unroll_stupid (loop, flags);
 438       if (loop->lpt_decision.decision == LPT_NONE)
 439         decide_peel_simple (loop, flags);
 440
 441       report_unroll_peel (loop, locus);
 442     }
 443 }
 444
 445 /* Decide whether the LOOP is once rolling and suitable for complete
 446    peeling.  */
 447 static void
 448 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 449 {
 450   struct niter_desc *desc;
 451
 452   if (dump_file)
 453     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 454
 455   /* Is the loop small enough?  */
 456   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 457     {
 458       if (dump_file)
 459         fprintf (dump_file, ";; Not considering loop, is too big\n");
 460       return;
 461     }
 462
 463   /* Check for simple loops.  */
 464   desc = get_simple_loop_desc (loop);
 465
 466   /* Check number of iterations.  */
 467   if (!desc->simple_p
 468       || desc->assumptions
 469       || desc->infinite
 470       || !desc->const_iter
 471       || (desc->niter != 0
 472           && max_loop_iterations_int (loop) != 0))
 473     {
 474       if (dump_file)
 475         fprintf (dump_file,
 476                  ";; Unable to prove that the loop rolls exactly once\n");
 477       return;
 478     }
 479
 480   /* Success.  */
 481   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 482 }
 483
 484 /* Decide whether the LOOP is suitable for complete peeling.  */
 485 static void
 486 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 487 {
 488   unsigned npeel;
 489   struct niter_desc *desc;
 490
 491   if (dump_file)
 492     fprintf (dump_file, "\n;; Considering peeling completely\n");
 493
 494   /* Skip non-innermost loops.  */
 495   if (loop->inner)
 496     {
 497       if (dump_file)
 498         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 499       return;
 500     }
 501
 502   /* Do not peel cold areas.  */
 503   if (optimize_loop_for_size_p (loop))
 504     {
 505       if (dump_file)
 506         fprintf (dump_file, ";; Not considering loop, cold area\n");
 507       return;
 508     }
 509
 510   /* Can the loop be manipulated?  */
 511   if (!can_duplicate_loop_p (loop))
 512     {
 513       if (dump_file)
 514         fprintf (dump_file,
 515                  ";; Not considering loop, cannot duplicate\n");
 516       return;
 517     }
 518
 519   /* npeel = number of iterations to peel.  */
 520   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 521   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 522     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 523
 524   /* Is the loop small enough?  */
 525   if (!npeel)
 526     {
 527       if (dump_file)
 528         fprintf (dump_file, ";; Not considering loop, is too big\n");
 529       return;
 530     }
 531
 532   /* Check for simple loops.  */
 533   desc = get_simple_loop_desc (loop);
 534
 535   /* Check number of iterations.  */
 536   if (!desc->simple_p
 537       || desc->assumptions
 538       || !desc->const_iter
 539       || desc->infinite)
 540     {
 541       if (dump_file)
 542         fprintf (dump_file,
 543                  ";; Unable to prove that the loop iterates constant times\n");
 544       return;
 545     }
 546
 547   if (desc->niter > npeel - 1)
 548     {
 549       if (dump_file)
 550         {
 551           fprintf (dump_file,
 552                    ";; Not peeling loop completely, rolls too much (");
 553           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 554           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 555         }
 556       return;
 557     }
 558
 559   /* Success.  */
 560   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 561 }
 562
 563 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 564    completely.  The transformation done:
 565
 566    for (i = 0; i < 4; i++)
 567      body;
 568
 569    ==>
 570
 571    i = 0;
 572    body; i++;
 573    body; i++;
 574    body; i++;
 575    body; i++;
 576    */
 577 static void
 578 peel_loop_completely (struct loop *loop)
 579 {
 580   sbitmap wont_exit;
 581   unsigned HOST_WIDE_INT npeel;
 582   unsigned i;
 583   vec<edge> remove_edges;
 584   edge ein;
 585   struct niter_desc *desc = get_simple_loop_desc (loop);
 586   struct opt_info *opt_info = NULL;
 587
 588   npeel = desc->niter;
 589
 590   if (npeel)
 591     {
 592       bool ok;
 593
 594       wont_exit = sbitmap_alloc (npeel + 1);
 595       bitmap_ones (wont_exit);
 596       bitmap_clear_bit (wont_exit, 0);
 597       if (desc->noloop_assumptions)
 598         bitmap_clear_bit (wont_exit, 1);
 599
 600       remove_edges.create (0);
 601
 602       if (flag_split_ivs_in_unroller)
 603         opt_info = analyze_insns_in_loop (loop);
 604
 605       opt_info_start_duplication (opt_info);
 606       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 607                                           npeel,
 608                                           wont_exit, desc->out_edge,
 609                                           &remove_edges,
 610                                           DLTHE_FLAG_UPDATE_FREQ
 611                                           | DLTHE_FLAG_COMPLETTE_PEEL
 612                                           | (opt_info
 613                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 614       gcc_assert (ok);
 615
 616       free (wont_exit);
 617
 618       if (opt_info)
 619         {
 620           apply_opt_in_copies (opt_info, npeel, false, true);
 621           free_opt_info (opt_info);
 622         }
 623
 624       /* Remove the exit edges.  */
 625       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 626         remove_path (ein);
 627       remove_edges.release ();
 628     }
 629
 630   ein = desc->in_edge;
 631   free_simple_loop_desc (loop);
 632
 633   /* Now remove the unreachable part of the last iteration and cancel
 634      the loop.  */
 635   remove_path (ein);
 636
 637   if (dump_file)
 638     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 639 }
 640
 641 /* Decide whether to unroll LOOP iterating constant number of times
 642    and how much.  */
 643
 644 static void
 645 decide_unroll_constant_iterations (struct loop *loop, int flags)
 646 {
 647   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 648   struct niter_desc *desc;
 649   double_int iterations;
 650
 651   if (!(flags & UAP_UNROLL))
 652     {
 653       /* We were not asked to, just return back silently.  */
 654       return;
 655     }
 656
 657   if (dump_file)
 658     fprintf (dump_file,
 659              "\n;; Considering unrolling loop with constant "
 660              "number of iterations\n");
 661
 662   /* nunroll = total number of copies of the original loop body in
 663      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 664   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 665   nunroll_by_av
 666     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 667   if (nunroll > nunroll_by_av)
 668     nunroll = nunroll_by_av;
 669   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 670     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 671
 672   /* Skip big loops.  */
 673   if (nunroll <= 1)
 674     {
 675       if (dump_file)
 676         fprintf (dump_file, ";; Not considering loop, is too big\n");
 677       return;
 678     }
 679
 680   /* Check for simple loops.  */
 681   desc = get_simple_loop_desc (loop);
 682
 683   /* Check number of iterations.  */
 684   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 685     {
 686       if (dump_file)
 687         fprintf (dump_file,
 688                  ";; Unable to prove that the loop iterates constant times\n");
 689       return;
 690     }
 691
 692   /* Check whether the loop rolls enough to consider.
 693      Consult also loop bounds and profile; in the case the loop has more
 694      than one exit it may well loop less than determined maximal number
 695      of iterations.  */
 696   if (desc->niter < 2 * nunroll
 697       || ((estimated_loop_iterations (loop, &iterations)
 698            || max_loop_iterations (loop, &iterations))
 699           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 700     {
 701       if (dump_file)
 702         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 703       return;
 704     }
 705
 706   /* Success; now compute number of iterations to unroll.  We alter
 707      nunroll so that as few as possible copies of loop body are
 708      necessary, while still not decreasing the number of unrollings
 709      too much (at most by 1).  */
 710   best_copies = 2 * nunroll + 10;
 711
 712   i = 2 * nunroll + 2;
 713   if (i - 1 >= desc->niter)
 714     i = desc->niter - 2;
 715
 716   for (; i >= nunroll - 1; i--)
 717     {
 718       unsigned exit_mod = desc->niter % (i + 1);
 719
 720       if (!loop_exit_at_end_p (loop))
 721         n_copies = exit_mod + i + 1;
 722       else if (exit_mod != (unsigned) i
 723                || desc->noloop_assumptions != NULL_RTX)
 724         n_copies = exit_mod + i + 2;
 725       else
 726         n_copies = i + 1;
 727
 728       if (n_copies < best_copies)
 729         {
 730           best_copies = n_copies;
 731           best_unroll = i;
 732         }
 733     }
 734
 735   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 736   loop->lpt_decision.times = best_unroll;
 737 }
 738
 739 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 740    The transformation does this:
 741
 742    for (i = 0; i < 102; i++)
 743      body;
 744
 745    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 746
 747    i = 0;
 748    body; i++;
 749    body; i++;
 750    while (i < 102)
 751      {
 752        body; i++;
 753        body; i++;
 754        body; i++;
 755        body; i++;
 756      }
 757   */
 758 static void
 759 unroll_loop_constant_iterations (struct loop *loop)
 760 {
 761   unsigned HOST_WIDE_INT niter;
 762   unsigned exit_mod;
 763   sbitmap wont_exit;
 764   unsigned i;
 765   vec<edge> remove_edges;
 766   edge e;
 767   unsigned max_unroll = loop->lpt_decision.times;
 768   struct niter_desc *desc = get_simple_loop_desc (loop);
 769   bool exit_at_end = loop_exit_at_end_p (loop);
 770   struct opt_info *opt_info = NULL;
 771   bool ok;
 772
 773   niter = desc->niter;
 774
 775   /* Should not get here (such loop should be peeled instead).  */
 776   gcc_assert (niter > max_unroll + 1);
 777
 778   exit_mod = niter % (max_unroll + 1);
 779
 780   wont_exit = sbitmap_alloc (max_unroll + 1);
 781   bitmap_ones (wont_exit);
 782
 783   remove_edges.create (0);
 784   if (flag_split_ivs_in_unroller
 785       || flag_variable_expansion_in_unroller)
 786     opt_info = analyze_insns_in_loop (loop);
 787
 788   if (!exit_at_end)
 789     {
 790       /* The exit is not at the end of the loop; leave exit test
 791          in the first copy, so that the loops that start with test
 792          of exit condition have continuous body after unrolling.  */
 793
 794       if (dump_file)
 795         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 796
 797       /* Peel exit_mod iterations.  */
 798       bitmap_clear_bit (wont_exit, 0);
 799       if (desc->noloop_assumptions)
 800         bitmap_clear_bit (wont_exit, 1);
 801
 802       if (exit_mod)
 803         {
 804           opt_info_start_duplication (opt_info);
 805           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 806                                               exit_mod,
 807                                               wont_exit, desc->out_edge,
 808                                               &remove_edges,
 809                                               DLTHE_FLAG_UPDATE_FREQ
 810                                               | (opt_info && exit_mod > 1
 811                                                  ? DLTHE_RECORD_COPY_NUMBER
 812                                                    : 0));
 813           gcc_assert (ok);
 814
 815           if (opt_info && exit_mod > 1)
 816             apply_opt_in_copies (opt_info, exit_mod, false, false);
 817
 818           desc->noloop_assumptions = NULL_RTX;
 819           desc->niter -= exit_mod;
 820           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 821           if (loop->any_estimate
 822               && double_int::from_uhwi (exit_mod).ule
 823                    (loop->nb_iterations_estimate))
 824             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 825           else
 826             loop->any_estimate = false;
 827         }
 828
 829       bitmap_set_bit (wont_exit, 1);
 830     }
 831   else
 832     {
 833       /* Leave exit test in last copy, for the same reason as above if
 834          the loop tests the condition at the end of loop body.  */
 835
 836       if (dump_file)
 837         fprintf (dump_file, ";; Condition at end of loop.\n");
 838
 839       /* We know that niter >= max_unroll + 2; so we do not need to care of
 840          case when we would exit before reaching the loop.  So just peel
 841          exit_mod + 1 iterations.  */
 842       if (exit_mod != max_unroll
 843           || desc->noloop_assumptions)
 844         {
 845           bitmap_clear_bit (wont_exit, 0);
 846           if (desc->noloop_assumptions)
 847             bitmap_clear_bit (wont_exit, 1);
 848
 849           opt_info_start_duplication (opt_info);
 850           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 851                                               exit_mod + 1,
 852                                               wont_exit, desc->out_edge,
 853                                               &remove_edges,
 854                                               DLTHE_FLAG_UPDATE_FREQ
 855                                               | (opt_info && exit_mod > 0
 856                                                  ? DLTHE_RECORD_COPY_NUMBER
 857                                                    : 0));
 858           gcc_assert (ok);
 859
 860           if (opt_info && exit_mod > 0)
 861             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 862
 863           desc->niter -= exit_mod + 1;
 864           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 865           if (loop->any_estimate
 866               && double_int::from_uhwi (exit_mod + 1).ule
 867                    (loop->nb_iterations_estimate))
 868             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 869           else
 870             loop->any_estimate = false;
 871           desc->noloop_assumptions = NULL_RTX;
 872
 873           bitmap_set_bit (wont_exit, 0);
 874           bitmap_set_bit (wont_exit, 1);
 875         }
 876
 877       bitmap_clear_bit (wont_exit, max_unroll);
 878     }
 879
 880   /* Now unroll the loop.  */
 881
 882   opt_info_start_duplication (opt_info);
 883   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 884                                       max_unroll,
 885                                       wont_exit, desc->out_edge,
 886                                       &remove_edges,
 887                                       DLTHE_FLAG_UPDATE_FREQ
 888                                       | (opt_info
 889                                          ? DLTHE_RECORD_COPY_NUMBER
 890                                            : 0));
 891   gcc_assert (ok);
 892
 893   if (opt_info)
 894     {
 895       apply_opt_in_copies (opt_info, max_unroll, true, true);
 896       free_opt_info (opt_info);
 897     }
 898
 899   free (wont_exit);
 900
 901   if (exit_at_end)
 902     {
 903       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 904       /* Find a new in and out edge; they are in the last copy we have made.  */
 905
 906       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 907         {
 908           desc->out_edge = EDGE_SUCC (exit_block, 0);
 909           desc->in_edge = EDGE_SUCC (exit_block, 1);
 910         }
 911       else
 912         {
 913           desc->out_edge = EDGE_SUCC (exit_block, 1);
 914           desc->in_edge = EDGE_SUCC (exit_block, 0);
 915         }
 916     }
 917
 918   desc->niter /= max_unroll + 1;
 919   loop->nb_iterations_upper_bound
 920     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 921                                                                    + 1),
 922                                             TRUNC_DIV_EXPR);
 923   if (loop->any_estimate)
 924     loop->nb_iterations_estimate
 925       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 926                                                                   + 1),
 927                                            TRUNC_DIV_EXPR);
 928   desc->niter_expr = GEN_INT (desc->niter);
 929
 930   /* Remove the edges.  */
 931   FOR_EACH_VEC_ELT (remove_edges, i, e)
 932     remove_path (e);
 933   remove_edges.release ();
 934
 935   if (dump_file)
 936     fprintf (dump_file,
 937              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 938              max_unroll, num_loop_insns (loop));
 939 }
 940
 941 /* Decide whether to unroll LOOP iterating runtime computable number of times
 942    and how much.

        FLAGS is a mask of UAP_* bits; unless UAP_UNROLL is set nothing is
        done.  On success the decision LPT_UNROLL_RUNTIME is stored in
        LOOP->lpt_decision together with the number of additional body copies
        (a power of two minus one).  On every other path this function does
        not write to LOOP->lpt_decision.  */
 943 static void
 944 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 945 {
 946   unsigned nunroll, nunroll_by_av, i;
 947   struct niter_desc *desc;
 948   double_int iterations;
 949
 950   if (!(flags & UAP_UNROLL))
 951     {
 952       /* We were not asked to, just return back silently.  */
 953       return;
 954     }
 955
 956   if (dump_file)
 957     fprintf (dump_file,
 958              "\n;; Considering unrolling loop with runtime "
 959              "computable number of iterations\n");
 960
 961   /* nunroll = total number of copies of the original loop body in
 962      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).
          It is the smallest of PARAM_MAX_UNROLLED_INSNS / ninsns,
          PARAM_MAX_AVERAGE_UNROLLED_INSNS / av_ninsns and
          PARAM_MAX_UNROLL_TIMES.  */
 963   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 964   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 965   if (nunroll > nunroll_by_av)
 966     nunroll = nunroll_by_av;
 967   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 968     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 969
       /* The target may further adjust the factor through its hook.  */
 970   if (targetm.loop_unroll_adjust)
 971     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 972
 973   /* Skip big loops.  */
 974   if (nunroll <= 1)
 975     {
 976       if (dump_file)
 977         fprintf (dump_file, ";; Not considering loop, is too big\n");
 978       return;
 979     }
 980
 981   /* Check for simple loops.  */
 982   desc = get_simple_loop_desc (loop);
 983
 984   /* Check simpleness.  */
 985   if (!desc->simple_p || desc->assumptions)
 986     {
 987       if (dump_file)
 988         fprintf (dump_file,
 989                  ";; Unable to prove that the number of iterations "
 990                  "can be counted in runtime\n");
 991       return;
 992     }
 993
       /* Loops with a constant iteration count are not handled by this
          routine.  */
 994   if (desc->const_iter)
 995     {
 996       if (dump_file)
 997         fprintf (dump_file, ";; Loop iterates constant times\n");
 998       return;
 999     }
1000
1001   /* Check whether the loop rolls.  The estimated iteration count is
          consulted first and the maximum only if no estimate is returned;
          the loop is rejected when that value is below 2 * nunroll.  */
1002   if ((estimated_loop_iterations (loop, &iterations)
1003        || max_loop_iterations (loop, &iterations))
1004       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1005     {
1006       if (dump_file)
1007         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1008       return;
1009     }
1010
1011   /* Success; now force nunroll to be power of 2, as we are unable to
1012      cope with overflows in computation of number of iterations.  After
          the loop I is the largest power of two not exceeding nunroll.  */
1013   for (i = 1; 2 * i <= nunroll; i *= 2)
1014     continue;
1015
       /* TIMES counts the additional copies of the body, hence I - 1.  */
1016   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1017   loop->lpt_decision.times = i - 1;
1018 }
1019
1020 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1021    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1022    and NULL is returned instead.  */
1023
1024 basic_block
1025 split_edge_and_insert (edge e, rtx insns)
1026 {
1027   basic_block bb;
1028
1029   if (!insns)
1030     return NULL;
       /* Create the block on E, then append INSNS after its last insn.  */
1031   bb = split_edge (e);
1032   emit_insn_after (insns, BB_END (bb));
1033
1034   /* ??? We used to assume that INSNS can contain control flow insns, and
1035      that we had to try to find sub basic blocks in BB to maintain a valid
1036      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1037      and call break_superblocks when going out of cfglayout mode.  But it
1038      turns out that this never happens; and that if it does ever happen,
1039      the TODO_verify_flow at the end of the RTL loop passes would fail.
1040
1041      There are two reasons why we expected we could have control flow insns
1042      in INSNS.  The first is when a comparison has to be done in parts, and
1043      the second is when the number of iterations is computed for loops with
1044      the number of iterations known at runtime.  In both cases, test cases
1045      to get control flow in INSNS appear to be impossible to construct:
1046
1047       * If do_compare_rtx_and_jump needs several branches to do comparison
1048         in a mode that needs comparison by parts, we cannot analyze the
1049         number of iterations of the loop, and we never get to unrolling it.
1050
1051       * The code in expand_divmod that was suspected to cause creation of
1052         branching code seems to be only accessed for signed division.  The
1053         divisions used by # of iterations analysis are always unsigned.
1054         Problems might arise on architectures that emit branching code
1055         for some operations that may appear in the unroller (especially
1056         for division), but we have no such architectures.
1057
1058      Considering all this, it was decided that we should for now assume
1059      that INSNS can in theory contain control flow insns, but in practice
1060      it never does.  So we don't handle the theoretical case, and should
1061      a real failure ever show up, we have a pretty good clue for how to
1062      fix it.  */
1063
1064   return bb;
1065 }
1066
1067 /* Unroll LOOP for which we are able to count number of iterations in runtime
1068    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1069    extra care for case n < 0):
1070
1071    for (i = 0; i < n; i++)
1072      body;
1073
1074    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1075
1076    i = 0;
1077    mod = n % 4;
1078
1079    switch (mod)
1080      {
1081        case 3:
1082          body; i++;
1083        case 2:
1084          body; i++;
1085        case 1:
1086          body; i++;
1087        case 0: ;
1088      }
1089
1090    while (i < n)
1091      {
1092        body; i++;
1093        body; i++;
1094        body; i++;
1095        body; i++;
1096      }

        LOOP->LPT_DECISION.TIMES is used directly as the AND mask that computes
        MOD, which relies on TIMES + 1 being a power of two.  Before returning,
        the niter expression of the loop descriptor and the iteration bound and
        estimate stored in LOOP are updated for the unrolled loop.
1097    */
1098 static void
1099 unroll_loop_runtime_iterations (struct loop *loop)
1100 {
1101   rtx old_niter, niter, init_code, branch_code, tmp;
1102   unsigned i, j, p;
1103   basic_block preheader, *body, swtch, ezc_swtch;
1104   vec<basic_block> dom_bbs;
1105   sbitmap wont_exit;
1106   int may_exit_copy;
1107   unsigned n_peel;
1108   vec<edge> remove_edges;
1109   edge e;
1110   bool extra_zero_check, last_may_exit;
1111   unsigned max_unroll = loop->lpt_decision.times;
1112   struct niter_desc *desc = get_simple_loop_desc (loop);
1113   bool exit_at_end = loop_exit_at_end_p (loop);
1114   struct opt_info *opt_info = NULL;
1115   bool ok;
1116
       /* The insns of the loop are analyzed only if IV splitting or variable
          expansion is enabled; otherwise OPT_INFO stays NULL.  */
1117   if (flag_split_ivs_in_unroller
1118       || flag_variable_expansion_in_unroller)
1119     opt_info = analyze_insns_in_loop (loop);
1120
1121   /* Remember blocks whose dominators will have to be updated.  These are
          the blocks outside of LOOP that are dominated by a block of LOOP.  */
1122   dom_bbs.create (0);
1123
1124   body = get_loop_body (loop);
1125   for (i = 0; i < loop->num_nodes; i++)
1126     {
1127       vec<basic_block> ldom;
1128       basic_block bb;
1129
1130       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1131       FOR_EACH_VEC_ELT (ldom, j, bb)
1132         if (!flow_bb_inside_loop_p (loop, bb))
1133           dom_bbs.safe_push (bb);
1134
1135       ldom.release ();
1136     }
1137   free (body);
1138
1139   if (!exit_at_end)
1140     {
1141       /* Leave exit in first copy (for explanation why see comment in
1142          unroll_loop_constant_iterations).  */
1143       may_exit_copy = 0;
1144       n_peel = max_unroll - 1;
1145       extra_zero_check = true;
1146       last_may_exit = false;
1147     }
1148   else
1149     {
1150       /* Leave exit in last copy (for explanation why see comment in
1151          unroll_loop_constant_iterations).  */
1152       may_exit_copy = max_unroll;
1153       n_peel = max_unroll;
1154       extra_zero_check = false;
1155       last_may_exit = true;
1156     }
1157
1158   /* Get expression for number of iterations.  The full count is kept in
          OLD_NITER because it is needed for the niter_expr update at the end
          of this function.  */
1159   start_sequence ();
1160   old_niter = niter = gen_reg_rtx (desc->mode);
1161   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1162   if (tmp != niter)
1163     emit_move_insn (niter, tmp);
1164
1165   /* Count modulo by ANDing it with max_unroll; we use the fact that
1166      the number of unrollings is a power of two, and thus this is correct
1167      even if there is overflow in the computation.  From here on NITER
          refers to the result of the AND.  */
1168   niter = expand_simple_binop (desc->mode, AND,
1169                                niter, gen_int_mode (max_unroll, desc->mode),
1170                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1171
1172   init_code = get_insns ();
1173   end_sequence ();
1174   unshare_all_rtl_in_chain (init_code);
1175
1176   /* Precondition the loop.  */
1177   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1178
1179   remove_edges.create (0);
1180
1181   wont_exit = sbitmap_alloc (max_unroll + 2);
1182
1183   /* Peel the first copy of loop body (almost always we must leave exit test
1184      here; the only exception is when we have extra zero check and the number
1185      of iterations is reliable.  Also record the place of (possible) extra
1186      zero check.  */
1187   bitmap_clear (wont_exit);
1188   if (extra_zero_check
1189       && !desc->noloop_assumptions)
1190     bitmap_set_bit (wont_exit, 1);
1191   ezc_swtch = loop_preheader_edge (loop)->src;
1192   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1193                                       1, wont_exit, desc->out_edge,
1194                                       &remove_edges,
1195                                       DLTHE_FLAG_UPDATE_FREQ);
1196   gcc_assert (ok);
1197
1198   /* Record the place where switch will be built for preconditioning.  */
1199   swtch = split_edge (loop_preheader_edge (loop));
1200
1201   for (i = 0; i < n_peel; i++)
1202     {
1203       /* Peel the copy.  */
1204       bitmap_clear (wont_exit);
1205       if (i != n_peel - 1 || !last_may_exit)
1206         bitmap_set_bit (wont_exit, 1);
1207       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1208                                           1, wont_exit, desc->out_edge,
1209                                           &remove_edges,
1210                                           DLTHE_FLAG_UPDATE_FREQ);
1211       gcc_assert (ok);
1212
1213       /* Create item for switch.  J is the value NITER is compared against
              by this item and P is the probability given to its branch
              edge.  */
1214       j = n_peel - i - (extra_zero_check ? 0 : 1);
1215       p = REG_BR_PROB_BASE / (i + 2);
1216
1217       preheader = split_edge (loop_preheader_edge (loop));
1218       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1219                                           block_label (preheader), p,
1220                                           NULL_RTX);
1221
1222       /* We rely on the fact that the compare and jump cannot be optimized out,
1223          and hence the cfg we create is correct.  */
1224       gcc_assert (branch_code != NULL_RTX);
1225
           /* The new item is inserted on the edge entering SWTCH and becomes
              the new SWTCH; the branch edge to PREHEADER is added by hand.  */
1226       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1227       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1228       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1229       e = make_edge (swtch, preheader,
1230                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1231       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1232       e->probability = p;
1233     }
1234
1235   if (extra_zero_check)
1236     {
1237       /* Add branch for zero iterations.  It is placed after EZC_SWTCH, the
              block recorded before the first copy was peeled.  */
1238       p = REG_BR_PROB_BASE / (max_unroll + 1);
1239       swtch = ezc_swtch;
1240       preheader = split_edge (loop_preheader_edge (loop));
1241       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1242                                           block_label (preheader), p,
1243                                           NULL_RTX);
1244       gcc_assert (branch_code != NULL_RTX);
1245
1246       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1247       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1248       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1249       e = make_edge (swtch, preheader,
1250                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1251       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1252       e->probability = p;
1253     }
1254
1255   /* Recount dominators for outer blocks.  */
1256   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1257
1258   /* And unroll loop.  Every bit of WONT_EXIT is set except the one for
          MAY_EXIT_COPY.  */
1259
1260   bitmap_ones (wont_exit);
1261   bitmap_clear_bit (wont_exit, may_exit_copy);
1262   opt_info_start_duplication (opt_info);
1263
1264   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1265                                       max_unroll,
1266                                       wont_exit, desc->out_edge,
1267                                       &remove_edges,
1268                                       DLTHE_FLAG_UPDATE_FREQ
1269                                       | (opt_info
1270                                          ? DLTHE_RECORD_COPY_NUMBER
1271                                            : 0));
1272   gcc_assert (ok);
1273
1274   if (opt_info)
1275     {
1276       apply_opt_in_copies (opt_info, max_unroll, true, true);
1277       free_opt_info (opt_info);
1278     }
1279
1280   free (wont_exit);
1281
1282   if (exit_at_end)
1283     {
1284       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1285       /* Find a new in and out edge; they are in the last copy we have
1286          made.  The successor of EXIT_BLOCK that leads to the old exit
              destination becomes the out edge, the other one the in edge.  */
1287
1288       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1289         {
1290           desc->out_edge = EDGE_SUCC (exit_block, 0);
1291           desc->in_edge = EDGE_SUCC (exit_block, 1);
1292         }
1293       else
1294         {
1295           desc->out_edge = EDGE_SUCC (exit_block, 1);
1296           desc->in_edge = EDGE_SUCC (exit_block, 0);
1297         }
1298     }
1299
1300   /* Remove the edges.  */
1301   FOR_EACH_VEC_ELT (remove_edges, i, e)
1302     remove_path (e);
1303   remove_edges.release ();
1304
1305   /* We must be careful when updating the number of iterations due to
1306      preconditioning and the fact that the value must be valid at entry
1307      of the loop.  After passing through the above code, we see that
1308      the correct new number of iterations is this:  */
1309   gcc_assert (!desc->const_iter);
1310   desc->niter_expr =
1311     simplify_gen_binary (UDIV, desc->mode, old_niter,
1312                          gen_int_mode (max_unroll + 1, desc->mode));
1313   loop->nb_iterations_upper_bound
1314     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1315                                                                    + 1),
1316                                             TRUNC_DIV_EXPR);
1317   if (loop->any_estimate)
1318     loop->nb_iterations_estimate
1319       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1320                                                                   + 1),
1321                                            TRUNC_DIV_EXPR);
1322   if (exit_at_end)
1323     {
           /* With the exit test at the end, one more iteration is subtracted
              from the expression, the bound and the estimate; an estimate of
              zero cannot be decremented and is dropped instead.  */
1324       desc->niter_expr =
1325         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1326       desc->noloop_assumptions = NULL_RTX;
1327       --loop->nb_iterations_upper_bound;
1328       if (loop->any_estimate
1329           && loop->nb_iterations_estimate != double_int_zero)
1330         --loop->nb_iterations_estimate;
1331       else
1332         loop->any_estimate = false;
1333     }
1334
1335   if (dump_file)
1336     fprintf (dump_file,
1337              ";; Unrolled loop %d times, counting # of iterations "
1338              "in runtime, %i insns\n",
1339              max_unroll, num_loop_insns (loop));
1340
1341   dom_bbs.release ();
1342 }
1343
1344 /* Decide whether to simply peel LOOP and how much.

        FLAGS is a mask of UAP_* bits; unless UAP_PEEL is set nothing is done.
        On success the decision LPT_PEEL_SIMPLE and the number of iterations to
        peel are stored in LOOP->lpt_decision; on every other path this
        function does not write to LOOP->lpt_decision.  */
1345 static void
1346 decide_peel_simple (struct loop *loop, int flags)
1347 {
1348   unsigned npeel;
1349   double_int iterations;
1350
1351   if (!(flags & UAP_PEEL))
1352     {
1353       /* We were not asked to, just return back silently.  */
1354       return;
1355     }
1356
1357   if (dump_file)
1358     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1359
1360   /* npeel = number of iterations to peel.  It is limited both by
          PARAM_MAX_PEELED_INSNS / ninsns and by PARAM_MAX_PEEL_TIMES.  */
1361   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1362   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1363     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1364
1365   /* Skip big loops.  */
1366   if (!npeel)
1367     {
1368       if (dump_file)
1369         fprintf (dump_file, ";; Not considering loop, is too big\n");
1370       return;
1371     }
1372
1373   /* Do not simply peel loops with branches inside -- it increases number
1374      of mispredicts.
1375      Exception is when we do have profile and we however have good chance
1376      to peel proper number of iterations loop will iterate in practice.
1377      TODO: this heuristic needs tuning; while for complete unrolling
1378      the branch inside loop mostly eliminates any improvements, for
1379      peeling it is not the case.  Also a function call inside loop is
1380      also branch from branch prediction POV (and probably better reason
1381      to not unroll/peel).  */
1382   if (num_loop_branches (loop) > 1
1383       && profile_status != PROFILE_READ)
1384     {
1385       if (dump_file)
1386         fprintf (dump_file, ";; Not peeling, contains branches\n");
1387       return;
1388     }
1389
1390   /* If we have realistic estimate on number of iterations, use it.  The
          loop is rejected when the estimate is at least npeel; otherwise
          one more iteration than the estimate is peeled.  */
1391   if (estimated_loop_iterations (loop, &iterations))
1392     {
1393       if (double_int::from_shwi (npeel).ule (iterations))
1394         {
1395           if (dump_file)
1396             {
1397               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1398               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1399                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1400               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1401                        npeel);
1402             }
1403           return;
1404         }
1405       npeel = iterations.to_shwi () + 1;
1406     }
1407   /* If we have small enough bound on iterations, we can still peel (completely
1408      unroll).  */
1409   else if (max_loop_iterations (loop, &iterations)
1410            && iterations.ult (double_int::from_shwi (npeel)))
1411     npeel = iterations.to_shwi () + 1;
1412   else
1413     {
1414       /* For now we have no good heuristics to decide whether loop peeling
1415          will be effective, so disable it.  */
1416       if (dump_file)
1417         fprintf (dump_file,
1418                  ";; Not peeling loop, no evidence it will be profitable\n");
1419       return;
1420     }
1421
1422   /* Success.  */
1423   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1424   loop->lpt_decision.times = npeel;
1425 }
1426
1427 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1428
1429    while (cond)
1430      body;
1431
1432    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1433
1434    if (!cond) goto end;
1435    body;
1436    if (!cond) goto end;
1437    body;
1438    if (!cond) goto end;
1439    body;
1440    while (cond)
1441      body;
1442    end: ;

        Afterwards the simple loop descriptor of LOOP is either adjusted (when
        the number of iterations is constant) or freed.
1443    */
1444 static void
1445 peel_loop_simple (struct loop *loop)
1446 {
1447   sbitmap wont_exit;
1448   unsigned npeel = loop->lpt_decision.times;
1449   struct niter_desc *desc = get_simple_loop_desc (loop);
1450   struct opt_info *opt_info = NULL;
1451   bool ok;
1452
       /* The insns of the loop are analyzed only when IV splitting is enabled
          and more than one copy is peeled; otherwise OPT_INFO stays NULL.  */
1453   if (flag_split_ivs_in_unroller && npeel > 1)
1454     opt_info = analyze_insns_in_loop (loop);
1455
       /* No bit of WONT_EXIT is set, matching the transformation above in
          which every peeled copy is guarded by the exit condition.  */
1456   wont_exit = sbitmap_alloc (npeel + 1);
1457   bitmap_clear (wont_exit);
1458
1459   opt_info_start_duplication (opt_info);
1460
1461   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1462                                       npeel, wont_exit, NULL,
1463                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1464                                       | (opt_info
1465                                          ? DLTHE_RECORD_COPY_NUMBER
1466                                            : 0));
1467   gcc_assert (ok);
1468
1469   free (wont_exit);
1470
1471   if (opt_info)
1472     {
1473       apply_opt_in_copies (opt_info, npeel, false, false);
1474       free_opt_info (opt_info);
1475     }
1476
1477   if (desc->simple_p)
1478     {
1479       if (desc->const_iter)
1480         {
               /* A constant count is simply reduced by the number of peeled
                  iterations.  */
1481           desc->niter -= npeel;
1482           desc->niter_expr = GEN_INT (desc->niter);
1483           desc->noloop_assumptions = NULL_RTX;
1484         }
1485       else
1486         {
1487           /* We cannot just update niter_expr, as its value might be clobbered
1488              inside loop.  We could handle this by counting the number into
1489              temporary just like we do in runtime unrolling, but it does not
1490              seem worthwhile.  */
1491           free_simple_loop_desc (loop);
1492         }
1493     }
1494   if (dump_file)
1495     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1496 }
1497
1498 /* Decide whether to unroll LOOP stupidly and how much.

        FLAGS is a mask of UAP_* bits; unless UAP_UNROLL_ALL is set nothing is
        done.  On success the decision LPT_UNROLL_STUPID is stored in
        LOOP->lpt_decision together with the number of additional body copies
        (a power of two minus one).  On every other path this function does
        not write to LOOP->lpt_decision.  */
1499 static void
1500 decide_unroll_stupid (struct loop *loop, int flags)
1501 {
1502   unsigned nunroll, nunroll_by_av, i;
1503   struct niter_desc *desc;
1504   double_int iterations;
1505
1506   if (!(flags & UAP_UNROLL_ALL))
1507     {
1508       /* We were not asked to, just return back silently.  */
1509       return;
1510     }
1511
1512   if (dump_file)
1513     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1514
1515   /* nunroll = total number of copies of the original loop body in
1516      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).
          It is the smallest of PARAM_MAX_UNROLLED_INSNS / ninsns,
          PARAM_MAX_AVERAGE_UNROLLED_INSNS / av_ninsns and
          PARAM_MAX_UNROLL_TIMES.  */
1517   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1518   nunroll_by_av
1519     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1520   if (nunroll > nunroll_by_av)
1521     nunroll = nunroll_by_av;
1522   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1523     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1524
       /* The target may further adjust the factor through its hook.  */
1525   if (targetm.loop_unroll_adjust)
1526     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1527
1528   /* Skip big loops.  */
1529   if (nunroll <= 1)
1530     {
1531       if (dump_file)
1532         fprintf (dump_file, ";; Not considering loop, is too big\n");
1533       return;
1534     }
1535
1536   /* Check for simple loops.  */
1537   desc = get_simple_loop_desc (loop);
1538
1539   /* Check simpleness.  This is the inverse of the test made in
          decide_unroll_runtime_iterations: loops that are simple without
          assumptions are rejected here.  */
1540   if (desc->simple_p && !desc->assumptions)
1541     {
1542       if (dump_file)
1543         fprintf (dump_file, ";; The loop is simple\n");
1544       return;
1545     }
1546
1547   /* Do not unroll loops with branches inside -- it increases number
1548      of mispredicts.
1549      TODO: this heuristic needs tuning; call inside the loop body
1550      is also relatively good reason to not unroll.  */
1551   if (num_loop_branches (loop) > 1)
1552     {
1553       if (dump_file)
1554         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1555       return;
1556     }
1557
1558   /* Check whether the loop rolls.  The estimated iteration count is
          consulted first and the maximum only if no estimate is returned;
          the loop is rejected when that value is below 2 * nunroll.  */
1559   if ((estimated_loop_iterations (loop, &iterations)
1560        || max_loop_iterations (loop, &iterations))
1561       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1562     {
1563       if (dump_file)
1564         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1565       return;
1566     }
1567
1568   /* Success.  Now force nunroll to be power of 2, as it seems that this
1569      improves results (partially because of better alignments, partially
1570      because of some dark magic).  After the loop I is the largest power
          of two not exceeding nunroll.  */
1571   for (i = 1; 2 * i <= nunroll; i *= 2)
1572     continue;
1573
       /* TIMES counts the additional copies of the body, hence I - 1.  */
1574   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1575   loop->lpt_decision.times = i - 1;
1576 }
1577
1578 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1579
1580    while (cond)
1581      body;
1582
1583    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1584
1585    while (cond)
1586      {
1587        body;
1588        if (!cond) break;
1589        body;
1590        if (!cond) break;
1591        body;
1592        if (!cond) break;
1593        body;
1594      }

        Afterwards the loop descriptor of LOOP is no longer marked as simple.
1595    */
1596 static void
1597 unroll_loop_stupid (struct loop *loop)
1598 {
1599   sbitmap wont_exit;
1600   unsigned nunroll = loop->lpt_decision.times;
1601   struct niter_desc *desc = get_simple_loop_desc (loop);
1602   struct opt_info *opt_info = NULL;
1603   bool ok;
1604
       /* The insns of the loop are analyzed only if IV splitting or variable
          expansion is enabled; otherwise OPT_INFO stays NULL.  */
1605   if (flag_split_ivs_in_unroller
1606       || flag_variable_expansion_in_unroller)
1607     opt_info = analyze_insns_in_loop (loop);
1608
1609
       /* No bit of WONT_EXIT is set, matching the transformation above in
          which every copy of the body is followed by the exit condition.  */
1610   wont_exit = sbitmap_alloc (nunroll + 1);
1611   bitmap_clear (wont_exit);
1612   opt_info_start_duplication (opt_info);
1613
1614   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1615                                       nunroll, wont_exit,
1616                                       NULL, NULL,
1617                                       DLTHE_FLAG_UPDATE_FREQ
1618                                       | (opt_info
1619                                          ? DLTHE_RECORD_COPY_NUMBER
1620                                            : 0));
1621   gcc_assert (ok);
1622
1623   if (opt_info)
1624     {
1625       apply_opt_in_copies (opt_info, nunroll, true, true);
1626       free_opt_info (opt_info);
1627     }
1628
1629   free (wont_exit);
1630
1631   if (desc->simple_p)
1632     {
1633       /* We indeed may get here provided that there are nontrivial assumptions
1634          for a loop to be really simple.  We could update the counts, but the
1635          problem is that we are unable to decide which exit will be taken
1636          (not really true in case the number of iterations is constant,
1637          but no one will do anything with this information, so we do not
1638          worry about it).  */
1639       desc->simple_p = false;
1640     }
1641
1642   if (dump_file)
1643     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1644              nunroll, num_loop_insns (loop));
1645 }
1646
1647 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1648    *DEBUG_USES is incremented once for every debug insn found to reference
1649    the variable; it is not initialized here, so the caller has to do that.
        Debug insns do not contribute to the count that decides the return
        value.  */
1650
1651 bool
1652 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1653                                   int *debug_uses)
1654 {
1655   basic_block *body, bb;
1656   unsigned i;
1657   int count_ref = 0;
1658   rtx insn;
1659
1660   body = get_loop_body (loop);
1661   for (i = 0; i < loop->num_nodes; i++)
1662     {
1663       bb = body[i];
1664
           /* The break below leaves only the walk over the insns of BB; the
              remaining blocks are still visited, which cannot bring COUNT_REF
              back to 1.  */
1665       FOR_BB_INSNS (bb, insn)
1666         if (!rtx_referenced_p (reg, insn))
1667           continue;
1668         else if (DEBUG_INSN_P (insn))
1669           ++*debug_uses;
1670         else if (++count_ref > 1)
1671           break;
1672     }
1673   free (body);
1674   return (count_ref  == 1);
1675 }
1676
1677 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  The
        variable location of each such insn is replaced by an unknown
        location, and the walk stops once DEBUG_USES insns were handled.  */
1678
1679 static void
1680 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1681 {
1682   basic_block *body, bb;
1683   unsigned i;
1684   rtx insn;
1685
1686   body = get_loop_body (loop);
       /* DEBUG_USES counts the insns still to be reset; the outer loop ends
          as soon as it reaches zero.  */
1687   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1688     {
1689       bb = body[i];
1690
1691       FOR_BB_INSNS (bb, insn)
1692         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1693           continue;
1694         else
1695           {
1696             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1697                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1698             if (!--debug_uses)
1699               break;
1700           }
1701     }
1702   free (body);
1703 }
1704
1705 /* Determine whether INSN contains an accumulator
1706    which can be expanded into separate copies,
1707    one for each copy of the LOOP body.
1708
1709    for (i = 0 ; i < n; i++)
1710      sum += a[i];
1711
1712    ==>
1713
1714    sum += a[i]
1715    ....
1716    i = i+1;
1717    sum1 += a[i]
1718    ....
1719    i = i+1
1720    sum2 += a[i];
1721    ....
1722
1723    Return NULL if INSN contains no opportunity for expansion of accumulator.
1724    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1725    information and return a pointer to it.
1726 */
1727
1728 static struct var_to_expand *
1729 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1730 {
1731   rtx set, dest, src;
1732   struct var_to_expand *ves;
1733   unsigned accum_pos;
1734   enum rtx_code code;
1735   int debug_uses = 0;
1736
1737   set = single_set (insn);
1738   if (!set)
1739     return NULL;
1740
1741   dest = SET_DEST (set);
1742   src = SET_SRC (set);
1743   code = GET_CODE (src);
1744
1745   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1746     return NULL;
1747
1748   if (FLOAT_MODE_P (GET_MODE (dest)))
1749     {
1750       if (!flag_associative_math)
1751         return NULL;
1752       /* In the case of FMA, we're also changing the rounding.  */
1753       if (code == FMA && !flag_unsafe_math_optimizations)
1754         return NULL;
1755     }
1756
1757   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1758      in MD.  But if there is no optab to generate the insn, we can not
1759      perform the variable expansion.  This can happen if an MD provides
1760      an insn but not a named pattern to generate it, for example to avoid
1761      producing code that needs additional mode switches like for x87/mmx.
1762
1763      So we check have_insn_for which looks for an optab for the operation
1764      in SRC.  If it doesn't exist, we can't perform the expansion even
1765      though INSN is valid.  */
1766   if (!have_insn_for (code, GET_MODE (src)))
1767     return NULL;
1768
1769   if (!REG_P (dest)
1770       && !(GET_CODE (dest) == SUBREG
1771            && REG_P (SUBREG_REG (dest))))
1772     return NULL;
1773
1774   /* Find the accumulator use within the operation.  */
1775   if (code == FMA)
1776     {
1777       /* We only support accumulation via FMA in the ADD position.  */
1778       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1779         return NULL;
1780       accum_pos = 2;
1781     }
1782   else if (rtx_equal_p (dest, XEXP (src, 0)))
1783     accum_pos = 0;
1784   else if (rtx_equal_p (dest, XEXP (src, 1)))
1785     {
1786       /* The method of expansion that we are using; which includes the
1787          initialization of the expansions with zero and the summation of
1788          the expansions at the end of the computation will yield wrong
1789          results for (x = something - x) thus avoid using it in that case.  */
1790       if (code == MINUS)
1791         return NULL;
1792       accum_pos = 1;
1793     }
1794   else
1795     return NULL;
1796
1797   /* It must not otherwise be used.  */
1798   if (code == FMA)
1799     {
1800       if (rtx_referenced_p (dest, XEXP (src, 0))
1801           || rtx_referenced_p (dest, XEXP (src, 1)))
1802         return NULL;
1803     }
1804   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1805     return NULL;
1806
1807   /* It must be used in exactly one insn.  */
1808   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1809     return NULL;
1810
1811   if (dump_file)
1812     {
1813       fprintf (dump_file, "\n;; Expanding Accumulator ");
1814       print_rtl (dump_file, dest);
1815       fprintf (dump_file, "\n");
1816     }
1817
1818   if (debug_uses)
1819     /* Instead of resetting the debug insns, we could replace each
1820        debug use in the loop with the sum or product of all expanded
1821        accummulators.  Since we'll only know of all expansions at the
1822        end, we'd have to keep track of which vars_to_expand a debug
1823        insn in the loop references, take note of each copy of the
1824        debug insn during unrolling, and when it's all done, compute
1825        the sum or product of each variable and adjust the original
1826        debug insn and each copy thereof.  What a pain!  */
1827     reset_debug_uses_in_loop (loop, dest, debug_uses);
1828
1829   /* Record the accumulator to expand.  */
1830   ves = XNEW (struct var_to_expand);
1831   ves->insn = insn;
1832   ves->reg = copy_rtx (dest);
1833   ves->var_expansions.create (1);
1834   ves->next = NULL;
1835   ves->op = GET_CODE (src);
1836   ves->expansion_count = 0;
1837   ves->reuse_expansion = 0;
1838   return ves;
1839 }
1840
1841 /* Determine whether there is an induction variable in INSN that
1842    we would like to split during unrolling.
1843
1844    I.e. replace
1845
1846    i = i + 1;
1847    ...
1848    i = i + 1;
1849    ...
1850    i = i + 1;
1851    ...
1852
1853    type chains by
1854
1855    i0 = i + 1
1856    ...
1857    i = i0 + 1
1858    ...
1859    i = i0 + 2
1860    ...
1861
1862    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1863    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1864    pointer to it.  */
1865
1866 static struct iv_to_split *
1867 analyze_iv_to_split_insn (rtx insn)
1868 {
1869   rtx set, dest;
1870   struct rtx_iv iv;
1871   struct iv_to_split *ivts;
1872   bool ok;
1873
1874   /* For now we just split the basic induction variables.  Later this may be
1875      extended for example by selecting also addresses of memory references.  */
1876   set = single_set (insn);
1877   if (!set)
1878     return NULL;
1879
1880   dest = SET_DEST (set);
1881   if (!REG_P (dest))
1882     return NULL;
1883
1884   if (!biv_p (insn, dest))
1885     return NULL;
1886
1887   ok = iv_analyze_result (insn, dest, &iv);
1888
1889   /* This used to be an assert under the assumption that if biv_p returns
1890      true that iv_analyze_result must also return true.  However, that
1891      assumption is not strictly correct as evidenced by pr25569.
1892
1893      Returning NULL when iv_analyze_result returns false is safe and
1894      avoids the problems in pr25569 until the iv_analyze_* routines
1895      can be fixed, which is apparently hard and time consuming
1896      according to their author.  */
1897   if (! ok)
1898     return NULL;
1899
1900   if (iv.step == const0_rtx
1901       || iv.mode != iv.extend_mode)
1902     return NULL;
1903
1904   /* Record the insn to split.  */
1905   ivts = XNEW (struct iv_to_split);
1906   ivts->insn = insn;
1907   ivts->orig_var = dest;
1908   ivts->base_var = NULL_RTX;
1909   ivts->step = iv.step;
1910   ivts->next = NULL;
1911   ivts->n_loc = 1;
1912   ivts->loc[0] = 1;
1913
1914   return ivts;
1915 }
1916
1917 /* Determines which of insns in LOOP can be optimized.
1918    Return a OPT_INFO struct with the relevant hash tables filled
1919    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1920    is undefined for the return value.  */
1921
1922 static struct opt_info *
1923 analyze_insns_in_loop (struct loop *loop)
1924 {
1925   basic_block *body, bb;
1926   unsigned i;
1927   struct opt_info *opt_info = XCNEW (struct opt_info);
1928   rtx insn;
1929   struct iv_to_split *ivts = NULL;
1930   struct var_to_expand *ves = NULL;
1931   iv_to_split **slot1;
1932   var_to_expand **slot2;
1933   vec<edge> edges = get_loop_exit_edges (loop);
1934   edge exit;
1935   bool can_apply = false;
1936
1937   iv_analysis_loop_init (loop);
1938
1939   body = get_loop_body (loop);
1940
1941   if (flag_split_ivs_in_unroller)
1942     {
1943       opt_info->insns_to_split.create (5 * loop->num_nodes);
1944       opt_info->iv_to_split_head = NULL;
1945       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1946     }
1947
1948   /* Record the loop exit bb and loop preheader before the unrolling.  */
1949   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1950
1951   if (edges.length () == 1)
1952     {
1953       exit = edges[0];
1954       if (!(exit->flags & EDGE_COMPLEX))
1955         {
1956           opt_info->loop_exit = split_edge (exit);
1957           can_apply = true;
1958         }
1959     }
1960
1961   if (flag_variable_expansion_in_unroller
1962       && can_apply)
1963     {
1964       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1965       opt_info->var_to_expand_head = NULL;
1966       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1967     }
1968
1969   for (i = 0; i < loop->num_nodes; i++)
1970     {
1971       bb = body[i];
1972       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1973         continue;
1974
1975       FOR_BB_INSNS (bb, insn)
1976       {
1977         if (!INSN_P (insn))
1978           continue;
1979
1980         if (opt_info->insns_to_split.is_created ())
1981           ivts = analyze_iv_to_split_insn (insn);
1982
1983         if (ivts)
1984           {
1985             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
1986             gcc_assert (*slot1 == NULL);
1987             *slot1 = ivts;
1988             *opt_info->iv_to_split_tail = ivts;
1989             opt_info->iv_to_split_tail = &ivts->next;
1990             continue;
1991           }
1992
1993         if (opt_info->insns_with_var_to_expand.is_created ())
1994           ves = analyze_insn_to_expand_var (loop, insn);
1995
1996         if (ves)
1997           {
1998             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
1999             gcc_assert (*slot2 == NULL);
2000             *slot2 = ves;
2001             *opt_info->var_to_expand_tail = ves;
2002             opt_info->var_to_expand_tail = &ves->next;
2003           }
2004       }
2005     }
2006
2007   edges.release ();
2008   free (body);
2009   return opt_info;
2010 }
2011
2012 /* Called just before loop duplication.  Records start of duplicated area
2013    to OPT_INFO.  */
2014
2015 static void
2016 opt_info_start_duplication (struct opt_info *opt_info)
2017 {
2018   if (opt_info)
2019     opt_info->first_new_block = last_basic_block;
2020 }
2021
/* Return how many iterations separate the initialization of the base
   variable from copy number N_COPY.  N_COPIES is the total number of
   newly created copies, and UNROLLING tells whether the loop is being
   unrolled (true) or peeled (false).  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization sits in the original loop body,
     which has number 0, so the copy number is the distance.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization happens in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
2046
2047 /* Locate in EXPR the expression corresponding to the location recorded
2048    in IVTS, and return a pointer to the RTX for this location.  */
2049
2050 static rtx *
2051 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2052 {
2053   unsigned i;
2054   rtx *ret = &expr;
2055
2056   for (i = 0; i < ivts->n_loc; i++)
2057     ret = &XEXP (*ret, ivts->loc[i]);
2058
2059   return ret;
2060 }
2061
2062 /* Allocate basic variable for the induction variable chain.  */
2063
2064 static void
2065 allocate_basic_variable (struct iv_to_split *ivts)
2066 {
2067   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2068
2069   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2070 }
2071
2072 /* Insert initialization of basic variable of IVTS before INSN, taking
2073    the initial value from INSN.  */
2074
2075 static void
2076 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2077 {
2078   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2079   rtx seq;
2080
2081   start_sequence ();
2082   expr = force_operand (expr, ivts->base_var);
2083   if (expr != ivts->base_var)
2084     emit_move_insn (ivts->base_var, expr);
2085   seq = get_insns ();
2086   end_sequence ();
2087
2088   emit_insn_before (seq, insn);
2089 }
2090
2091 /* Replace the use of induction variable described in IVTS in INSN
2092    by base variable + DELTA * step.  */
2093
2094 static void
2095 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2096 {
2097   rtx expr, *loc, seq, incr, var;
2098   enum machine_mode mode = GET_MODE (ivts->base_var);
2099   rtx src, dest, set;
2100
2101   /* Construct base + DELTA * step.  */
2102   if (!delta)
2103     expr = ivts->base_var;
2104   else
2105     {
2106       incr = simplify_gen_binary (MULT, mode,
2107                                   ivts->step, gen_int_mode (delta, mode));
2108       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2109                                   ivts->base_var, incr);
2110     }
2111
2112   /* Figure out where to do the replacement.  */
2113   loc = get_ivts_expr (single_set (insn), ivts);
2114
2115   /* If we can make the replacement right away, we're done.  */
2116   if (validate_change (insn, loc, expr, 0))
2117     return;
2118
2119   /* Otherwise, force EXPR into a register and try again.  */
2120   start_sequence ();
2121   var = gen_reg_rtx (mode);
2122   expr = force_operand (expr, var);
2123   if (expr != var)
2124     emit_move_insn (var, expr);
2125   seq = get_insns ();
2126   end_sequence ();
2127   emit_insn_before (seq, insn);
2128
2129   if (validate_change (insn, loc, var, 0))
2130     return;
2131
2132   /* The last chance.  Try recreating the assignment in insn
2133      completely from scratch.  */
2134   set = single_set (insn);
2135   gcc_assert (set);
2136
2137   start_sequence ();
2138   *loc = var;
2139   src = copy_rtx (SET_SRC (set));
2140   dest = copy_rtx (SET_DEST (set));
2141   src = force_operand (src, dest);
2142   if (src != dest)
2143     emit_move_insn (dest, src);
2144   seq = get_insns ();
2145   end_sequence ();
2146
2147   emit_insn_before (seq, insn);
2148   delete_insn (insn);
2149 }
2150
2151
2152 /* Return one expansion of the accumulator recorded in struct VE.  */
2153
2154 static rtx
2155 get_expansion (struct var_to_expand *ve)
2156 {
2157   rtx reg;
2158
2159   if (ve->reuse_expansion == 0)
2160     reg = ve->reg;
2161   else
2162     reg = ve->var_expansions[ve->reuse_expansion - 1];
2163
2164   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2165     ve->reuse_expansion = 0;
2166   else
2167     ve->reuse_expansion++;
2168
2169   return reg;
2170 }
2171
2172
2173 /* Given INSN replace the uses of the accumulator recorded in VE
2174    with a new register.  */
2175
2176 static void
2177 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2178 {
2179   rtx new_reg, set;
2180   bool really_new_expansion = false;
2181
2182   set = single_set (insn);
2183   gcc_assert (set);
2184
2185   /* Generate a new register only if the expansion limit has not been
2186      reached.  Else reuse an already existing expansion.  */
2187   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2188     {
2189       really_new_expansion = true;
2190       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2191     }
2192   else
2193     new_reg = get_expansion (ve);
2194
2195   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2196   if (apply_change_group ())
2197     if (really_new_expansion)
2198       {
2199         ve->var_expansions.safe_push (new_reg);
2200         ve->expansion_count++;
2201       }
2202 }
2203
2204 /* Initialize the variable expansions in loop preheader.  PLACE is the
2205    loop-preheader basic block where the initialization of the
2206    expansions should take place.  The expansions are initialized with
2207    (-0) when the operation is plus or minus to honor sign zero.  This
2208    way we can prevent cases where the sign of the final result is
2209    effected by the sign of the expansion.  Here is an example to
2210    demonstrate this:
2211
2212    for (i = 0 ; i < n; i++)
2213      sum += something;
2214
2215    ==>
2216
2217    sum += something
2218    ....
2219    i = i+1;
2220    sum1 += something
2221    ....
2222    i = i+1
2223    sum2 += something;
2224    ....
2225
2226    When SUM is initialized with -zero and SOMETHING is also -zero; the
2227    final result of sum should be -zero thus the expansions sum1 and sum2
2228    should be initialized with -zero as well (otherwise we will get +zero
2229    as the final result).  */
2230
2231 static void
2232 insert_var_expansion_initialization (struct var_to_expand *ve,
2233                                      basic_block place)
2234 {
2235   rtx seq, var, zero_init;
2236   unsigned i;
2237   enum machine_mode mode = GET_MODE (ve->reg);
2238   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2239
2240   if (ve->var_expansions.length () == 0)
2241     return;
2242
2243   start_sequence ();
2244   switch (ve->op)
2245     {
2246     case FMA:
2247       /* Note that we only accumulate FMA via the ADD operand.  */
2248     case PLUS:
2249     case MINUS:
2250       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2251         {
2252           if (honor_signed_zero_p)
2253             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2254           else
2255             zero_init = CONST0_RTX (mode);
2256           emit_move_insn (var, zero_init);
2257         }
2258       break;
2259
2260     case MULT:
2261       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2262         {
2263           zero_init = CONST1_RTX (GET_MODE (var));
2264           emit_move_insn (var, zero_init);
2265         }
2266       break;
2267
2268     default:
2269       gcc_unreachable ();
2270     }
2271
2272   seq = get_insns ();
2273   end_sequence ();
2274
2275   emit_insn_after (seq, BB_END (place));
2276 }
2277
2278 /* Combine the variable expansions at the loop exit.  PLACE is the
2279    loop exit basic block where the summation of the expansions should
2280    take place.  */
2281
2282 static void
2283 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2284 {
2285   rtx sum = ve->reg;
2286   rtx expr, seq, var, insn;
2287   unsigned i;
2288
2289   if (ve->var_expansions.length () == 0)
2290     return;
2291
2292   start_sequence ();
2293   switch (ve->op)
2294     {
2295     case FMA:
2296       /* Note that we only accumulate FMA via the ADD operand.  */
2297     case PLUS:
2298     case MINUS:
2299       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2300         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2301       break;
2302
2303     case MULT:
2304       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2305         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2306       break;
2307
2308     default:
2309       gcc_unreachable ();
2310     }
2311
2312   expr = force_operand (sum, ve->reg);
2313   if (expr != ve->reg)
2314     emit_move_insn (ve->reg, expr);
2315   seq = get_insns ();
2316   end_sequence ();
2317
2318   insn = BB_HEAD (place);
2319   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2320     insn = NEXT_INSN (insn);
2321
2322   emit_insn_after (seq, insn);
2323 }
2324
2325 /* Strip away REG_EQUAL notes for IVs we're splitting.
2326
2327    Updating REG_EQUAL notes for IVs we split is tricky: We
2328    cannot tell until after unrolling, DF-rescanning, and liveness
2329    updating, whether an EQ_USE is reached by the split IV while
2330    the IV reg is still live.  See PR55006.
2331
2332    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2333    because RTL loop-iv requires us to defer rescanning insns and
2334    any notes attached to them.  So resort to old techniques...  */
2335
2336 static void
2337 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2338 {
2339   struct iv_to_split *ivts;
2340   rtx note = find_reg_equal_equiv_note (insn);
2341   if (! note)
2342     return;
2343   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2344     if (reg_mentioned_p (ivts->orig_var, note))
2345       {
2346         remove_note (insn, note);
2347         return;
2348       }
2349 }
2350
2351 /* Apply loop optimizations in loop copies using the
2352    data which gathered during the unrolling.  Structure
2353    OPT_INFO record that data.
2354
2355    UNROLLING is true if we unrolled (not peeled) the loop.
2356    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2357    the loop (as it should happen in complete unrolling, but not in ordinary
2358    peeling of the loop).  */
2359
2360 static void
2361 apply_opt_in_copies (struct opt_info *opt_info,
2362                      unsigned n_copies, bool unrolling,
2363                      bool rewrite_original_loop)
2364 {
2365   unsigned i, delta;
2366   basic_block bb, orig_bb;
2367   rtx insn, orig_insn, next;
2368   struct iv_to_split ivts_templ, *ivts;
2369   struct var_to_expand ve_templ, *ves;
2370
2371   /* Sanity check -- we need to put initialization in the original loop
2372      body.  */
2373   gcc_assert (!unrolling || rewrite_original_loop);
2374
2375   /* Allocate the basic variables (i0).  */
2376   if (opt_info->insns_to_split.is_created ())
2377     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2378       allocate_basic_variable (ivts);
2379
2380   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2381     {
2382       bb = BASIC_BLOCK (i);
2383       orig_bb = get_bb_original (bb);
2384
2385       /* bb->aux holds position in copy sequence initialized by
2386          duplicate_loop_to_header_edge.  */
2387       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2388                                         unrolling);
2389       bb->aux = 0;
2390       orig_insn = BB_HEAD (orig_bb);
2391       FOR_BB_INSNS_SAFE (bb, insn, next)
2392         {
2393           if (!INSN_P (insn)
2394               || (DEBUG_INSN_P (insn)
2395                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2396             continue;
2397
2398           while (!INSN_P (orig_insn)
2399                  || (DEBUG_INSN_P (orig_insn)
2400                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2401                          == LABEL_DECL)))
2402             orig_insn = NEXT_INSN (orig_insn);
2403
2404           ivts_templ.insn = orig_insn;
2405           ve_templ.insn = orig_insn;
2406
2407           /* Apply splitting iv optimization.  */
2408           if (opt_info->insns_to_split.is_created ())
2409             {
2410               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2411
2412               ivts = opt_info->insns_to_split.find (&ivts_templ);
2413
2414               if (ivts)
2415                 {
2416                   gcc_assert (GET_CODE (PATTERN (insn))
2417                               == GET_CODE (PATTERN (orig_insn)));
2418
2419                   if (!delta)
2420                     insert_base_initialization (ivts, insn);
2421                   split_iv (ivts, insn, delta);
2422                 }
2423             }
2424           /* Apply variable expansion optimization.  */
2425           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2426             {
2427               ves = (struct var_to_expand *)
2428                 opt_info->insns_with_var_to_expand.find (&ve_templ);
2429               if (ves)
2430                 {
2431                   gcc_assert (GET_CODE (PATTERN (insn))
2432                               == GET_CODE (PATTERN (orig_insn)));
2433                   expand_var_during_unrolling (ves, insn);
2434                 }
2435             }
2436           orig_insn = NEXT_INSN (orig_insn);
2437         }
2438     }
2439
2440   if (!rewrite_original_loop)
2441     return;
2442
2443   /* Initialize the variable expansions in the loop preheader
2444      and take care of combining them at the loop exit.  */
2445   if (opt_info->insns_with_var_to_expand.is_created ())
2446     {
2447       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2448         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2449       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2450         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2451     }
2452
2453   /* Rewrite also the original loop body.  Find them as originals of the blocks
2454      in the last copied iteration, i.e. those that have
2455      get_bb_copy (get_bb_original (bb)) == bb.  */
2456   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2457     {
2458       bb = BASIC_BLOCK (i);
2459       orig_bb = get_bb_original (bb);
2460       if (get_bb_copy (orig_bb) != bb)
2461         continue;
2462
2463       delta = determine_split_iv_delta (0, n_copies, unrolling);
2464       for (orig_insn = BB_HEAD (orig_bb);
2465            orig_insn != NEXT_INSN (BB_END (bb));
2466            orig_insn = next)
2467         {
2468           next = NEXT_INSN (orig_insn);
2469
2470           if (!INSN_P (orig_insn))
2471             continue;
2472
2473           ivts_templ.insn = orig_insn;
2474           if (opt_info->insns_to_split.is_created ())
2475             {
2476               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2477
2478               ivts = (struct iv_to_split *)
2479                 opt_info->insns_to_split.find (&ivts_templ);
2480               if (ivts)
2481                 {
2482                   if (!delta)
2483                     insert_base_initialization (ivts, orig_insn);
2484                   split_iv (ivts, orig_insn, delta);
2485                   continue;
2486                 }
2487             }
2488
2489         }
2490     }
2491 }
2492
2493 /* Release OPT_INFO.  */
2494
2495 static void
2496 free_opt_info (struct opt_info *opt_info)
2497 {
2498   if (opt_info->insns_to_split.is_created ())
2499     opt_info->insns_to_split.dispose ();
2500   if (opt_info->insns_with_var_to_expand.is_created ())
2501     {
2502       struct var_to_expand *ves;
2503
2504       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2505         ves->var_expansions.release ();
2506       opt_info->insns_with_var_to_expand.dispose ();
2507     }
2508   free (opt_info);
2509 }