/* Loop unrolling and peeling.
   Copyright (C) 2002-2014 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "tree.h"
#include "hard-reg-set.h"
#include "obstack.h"
#include "basic-block.h"
#include "cfgloop.h"
#include "params.h"
#include "expr.h"
#include "hash-table.h"
#include "recog.h"
#include "target.h"
#include "dumpfile.h"
/* This pass performs loop unrolling and peeling.  We only perform these
   optimizations on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from greater sequentiality
   of code, better code for further passes to optimize, and in some cases
   from fewer tests of exit conditions.  The main problem is code growth,
   which impacts performance negatively due to its effect on caches.

   What we do:

   -- complete peeling of once-rolling loops; this is the above-mentioned
      exception, as it causes the loop to be cancelled completely and
      does not cause code growth
   -- complete peeling of loops that roll a (small) constant number of times.
   -- simple peeling of the first iterations of loops that do not roll much
      (according to profile feedback)
   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of the remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code

   For more detailed descriptions of each of those, see the comments at
   the appropriate functions below.

   There are many parameters (defined and described in params.def) that
   control how much we unroll/peel.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll a loop; the experiments I have made
   showed that this choice may affect performance on the order of
   several %.  */
/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx insn;		/* The insn in which the induction variable occurs.  */
  rtx orig_var;		/* The variable (register) for the IV before split.  */
  rtx base_var;		/* The variable on which the values in the further
			   iterations are based.  */
  rtx step;		/* Step of the induction variable.  */
  struct iv_to_split *next;	/* Next entry in walking order.  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx insn;			/* The insn in which the variable expansion occurs.  */
  rtx reg;			/* The accumulator which is expanded.  */
  vec<rtx> var_expansions;	/* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;	/* Next entry in walking order.  */
  enum rtx_code op;		/* The type of the accumulation - addition, subtraction
				   or multiplication.  */
  int expansion_count;		/* Count the number of expansions generated so far.  */
  int reuse_expansion;		/* The expansion we intend to reuse to expand
				   the accumulator.  If REUSE_EXPANSION is 0 reuse
				   the original accumulator.  Else use
				   var_expansions[REUSE_EXPANSION - 1].  */
};
/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : typed_free_remove <iv_to_split>
{
  typedef iv_to_split value_type;
  typedef iv_to_split compare_type;
  static inline hashval_t hash (const value_type *);
  static inline bool equal (const value_type *, const compare_type *);
};

/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const value_type *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}

/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
{
  return i1->insn == i2->insn;
}

/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : typed_free_remove <var_to_expand>
{
  typedef var_to_expand value_type;
  typedef var_to_expand compare_type;
  static inline hashval_t hash (const value_type *);
  static inline bool equal (const value_type *, const compare_type *);
};

/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const value_type *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}

/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
{
  return i1->insn == i2->insn;
}
/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head;	/* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};
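
/* Typical driving sequence for this machinery, distilled from the
   transformation functions below (a usage sketch, not a new API):

     opt_info = analyze_insns_in_loop (loop);
     opt_info_start_duplication (opt_info);
     ... duplicate_loop_to_header_edge (...), passing
	 DLTHE_RECORD_COPY_NUMBER when opt_info is non-NULL ...
     apply_opt_in_copies (opt_info, n_copies, unrolling, rewrite_original);
     free_opt_info (opt_info);

   The two boolean arguments vary per caller (see e.g.
   peel_loop_completely and unroll_loop_stupid); the argument names
   here are illustrative.  */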
static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);
static void decide_peel_simple (struct loop *, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx);
static void expand_var_during_unrolling (struct var_to_expand *, rtx);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);
/* Emit a message summarizing the unroll or peel that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll_peel (struct loop *loop, location_t locus)
{
  struct niter_desc *desc;
  int niters = 0;
  int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  /* In the special case where the loop never iterated, emit
     a different message so that we don't report an unroll by 0.
     This matches the equivalent message emitted during tree unrolling.  */
  if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
      && !loop->lpt_decision.times)
    {
      dump_printf_loc (report_flags, locus,
		       "loop turned into non-loop; it never loops.\n");
      return;
    }

  desc = get_simple_loop_desc (loop);

  if (desc->const_iter)
    niters = desc->niter;
  else if (loop->header->count)
    niters = expected_loop_iterations (loop);

  if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
    dump_printf_loc (report_flags, locus,
		     "loop with %d iterations completely unrolled",
		     loop->lpt_decision.times + 1);
  else
    dump_printf_loc (report_flags, locus,
		     "loop %s %d times",
		     (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
		      ? "peeled" : "unrolled"),
		     loop->lpt_decision.times);
  if (profile_info)
    dump_printf (report_flags,
		 " (header execution count %d",
		 (int)loop->header->count);
  if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
    dump_printf (report_flags,
		 "%s%s iterations %d)",
		 profile_info ? ", " : " (",
		 desc->const_iter ? "const" : "average",
		 niters);
  else if (profile_info)
    dump_printf (report_flags, ")");

  dump_printf (report_flags, "\n");
}
/* Unroll and/or peel (depending on FLAGS) LOOPS.  */
void
unroll_and_peel_loops (int flags)
{
  struct loop *loop;
  bool changed = false;

  /* First perform complete loop peeling (it is almost surely a win,
     and affects parameters for further decision a lot).  */
  peel_loops_completely (flags);

  /* Now decide the rest of the unrolling and peeling.  */
  decide_unrolling_and_peeling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_PEEL_COMPLETELY:
	  /* Already done.  */
	  gcc_unreachable ();
	case LPT_PEEL_SIMPLE:
	  peel_loop_simple (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }

  iv_analysis_done ();
}
/* Check whether the exit of the LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx insn;

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    if (NONDEBUG_INSN_P (insn))
      return false;

  return true;
}
/* Depending on FLAGS, check whether to peel loops completely and do so.  */
static void
peel_loops_completely (int flags)
{
  struct loop *loop;
  bool changed = false;

  /* Scan the loops, the inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (TDF_RTL, locus,
			 ";; *** Considering loop %d at BB %d for "
			 "complete peeling ***\n",
			 loop->num, loop->header->index);

      loop->ninsns = num_loop_insns (loop);

      decide_peel_once_rolling (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_peel_completely (loop, flags);

      if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
	{
	  report_unroll_peel (loop, locus);
	  peel_loop_completely (loop);
	  changed = true;
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }
}
/* Decide whether to unroll or peel loops (depending on FLAGS) and how much.  */
static void
decide_unrolling_and_peeling (int flags)
{
  struct loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (TDF_RTL, locus,
			 ";; *** Considering loop %d at BB %d for "
			 "unrolling and peeling ***\n",
			 loop->num, loop->header->index);

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
	 priority.  */

      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_peel_simple (loop, flags);

      report_unroll_peel (loop, locus);
    }
}
/* Decide whether the LOOP is once rolling and suitable for complete
   peeling.  */
static void
decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");

  /* Is the loop small enough?  */
  if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || desc->infinite
      || !desc->const_iter
      || (desc->niter != 0
	  && get_max_loop_iterations_int (loop) != 0))
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop rolls exactly once\n");
      return;
    }

  /* Success.  */
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}
/* Decide whether the LOOP is suitable for complete peeling.  */
static void
decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  unsigned npeel;
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling completely\n");

  /* Skip non-innermost loops.  */
  if (loop->inner)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is not innermost\n");
      return;
    }

  /* Do not peel cold areas.  */
  if (optimize_loop_for_size_p (loop))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, cold area\n");
      return;
    }

  /* Can the loop be manipulated?  */
  if (!can_duplicate_loop_p (loop))
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Not considering loop, cannot duplicate\n");
      return;
    }

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
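
  /* For example (illustrative numbers, not necessarily the parameter
     defaults): a 25-insn loop body with a 400-insn budget gives
     npeel == 16, which a peel-times limit of 8 then caps at 8; a loop
     bigger than the whole budget gives npeel == 0 and is rejected
     below.  */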
  /* Is the loop small enough?  */
  if (!npeel)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || !desc->const_iter
      || desc->infinite)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  if (desc->niter > npeel - 1)
    {
      if (dump_file)
	{
	  fprintf (dump_file,
		   ";; Not peeling loop completely, rolls too much (");
	  fprintf (dump_file, "%"PRId64, desc->niter);
	  fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
	}
      return;
    }

  /* Success.  */
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}
/* Peel all iterations of LOOP, remove exit edges and cancel the loop
   completely.  The transformation done:

   for (i = 0; i < 4; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   body; i++;
   body; i++;
   */
static void
peel_loop_completely (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned HOST_WIDE_INT npeel;
  unsigned i;
  edge ein;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;

  npeel = desc->niter;

  if (npeel)
    {
      bool ok;

      wont_exit = sbitmap_alloc (npeel + 1);
      bitmap_ones (wont_exit);
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      auto_vec<edge> remove_edges;
      if (flag_split_ivs_in_unroller)
	opt_info = analyze_insns_in_loop (loop);

      opt_info_start_duplication (opt_info);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  npeel,
					  wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ
					  | DLTHE_FLAG_COMPLETTE_PEEL
					  | (opt_info
					     ? DLTHE_RECORD_COPY_NUMBER : 0));
      gcc_assert (ok);

      free (wont_exit);

      if (opt_info)
	{
	  apply_opt_in_copies (opt_info, npeel, false, true);
	  free_opt_info (opt_info);
	}

      /* Remove the exit edges.  */
      FOR_EACH_VEC_ELT (remove_edges, i, ein)
	remove_path (ein);
    }

  ein = desc->in_edge;
  free_simple_loop_desc (loop);

  /* Now remove the unreachable part of the last iteration and cancel
     the loop.  */
  remove_path (ein);

  if (dump_file)
    fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
}
/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with constant "
	     "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also the loop bounds and profile; in case the loop has more
     than one exit it may well loop less than the determined maximal number
     of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}
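
/* A worked example of the search above (illustrative numbers, exit not
   at the end of the body): with desc->niter == 100 and nunroll == 8,
   unrolling i == 7 times (8 body copies) leaves 100 % 8 == 4 iterations
   to peel, 12 copies in total, whereas i == 9 (10 copies) divides
   evenly, 100 % 10 == 0, so only 10 copies are needed and best_unroll
   becomes 9.  */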
/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test
	 of the exit condition have a continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as
	 above if the loop tests the condition at the end of the loop
	 body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care about
	 the case when we would exit before reaching the loop.  So just peel
	 exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
/* Decide whether to unroll LOOP iterating a runtime-computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with runtime "
	     "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}
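
/* E.g. nunroll == 10 yields i == 8 from the loop above (i goes 1, 2, 4,
   8; then 16 > 10 stops it), so the unrolled loop contains 8 body copies
   and lpt_decision.times == 7.  */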
/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do a comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by the # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}
/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
		      rtx cinsn)
{
  rtx seq, jump, cond;
  enum machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = get_last_insn ();
      gcc_assert (JUMP_P (jump));
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL_RTX, label, -1);
      jump = get_last_insn ();
      gcc_assert (JUMP_P (jump));
      JUMP_LABEL (jump) = label;
      LABEL_NUSES (label)++;
    }
  add_int_reg_note (jump, REG_BR_PROB, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}
/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
	 body; i++;
       case 2:
	 body; i++;
       case 1:
	 body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, init_code, branch_code, tmp;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count the modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
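
  /* E.g. with max_unroll == 3 (4 body copies), niter & 3 computes
     niter % 4 for any unsigned value, even one that wrapped around,
     because 4 is a power of two.  */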
  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of the loop body (almost always we must leave the
     exit test here; the only exception is when we have an extra zero check
     and the number of iterations is reliable).  Also record the place of
     the (possible) extra zero check.  */
  bitmap_clear (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    bitmap_set_bit (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
				      1, wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
					  block_label (preheader), p,
					  NULL_RTX);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL_RTX);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
/* Decide whether to simply peel LOOP and how much.  */
static void
decide_peel_simple (struct loop *loop, int flags)
{
  unsigned npeel;
  widest_int iterations;

  if (!(flags & UAP_PEEL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering simply peeling loop\n");

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);

  /* Skip big loops.  */
  if (!npeel)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Do not simply peel loops with branches inside -- it increases the
     number of mispredicts.  The exception is when we have a profile and
     thus a good chance of peeling the number of iterations the loop will
     actually iterate in practice.
     TODO: this heuristic needs tuning; while for complete unrolling
     the branch inside the loop mostly eliminates any improvements, for
     peeling it is not the case.  Also, a function call inside the loop is
     a branch from the branch prediction POV (and probably a better reason
     to not unroll/peel).  */
  if (num_loop_branches (loop) > 1
      && profile_status_for_fn (cfun) != PROFILE_READ)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not peeling, contains branches\n");
      return;
    }

  /* If we have a realistic estimate of the number of iterations, use it.  */
  if (get_estimated_loop_iterations (loop, &iterations))
    {
      if (wi::leu_p (npeel, iterations))
	{
	  if (dump_file)
	    {
	      fprintf (dump_file, ";; Not peeling loop, rolls too much (");
	      fprintf (dump_file, "%"PRId64,
		       (int64_t) (iterations.to_shwi () + 1));
	      fprintf (dump_file, " iterations > %d [maximum peelings])\n",
		       npeel);
	    }
	  return;
	}
      npeel = iterations.to_shwi () + 1;
    }
  /* If we have a small enough bound on the iterations, we can still peel
     (completely unroll).  */
  else if (get_max_loop_iterations (loop, &iterations)
	   && wi::ltu_p (iterations, npeel))
    npeel = iterations.to_shwi () + 1;
  else
    {
      /* For now we have no good heuristics to decide whether loop peeling
	 will be effective, so disable it.  */
      if (dump_file)
	fprintf (dump_file,
		 ";; Not peeling loop, no evidence it will be profitable\n");
      return;
    }

  /* Success.  */
  loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
  loop->lpt_decision.times = npeel;
}
/* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   if (!cond) goto end;
   body;
   if (!cond) goto end;
   body;
   if (!cond) goto end;
   body;
   while (cond)
     body;
   end: ;
   */
static void
peel_loop_simple (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned npeel = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller && npeel > 1)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (npeel + 1);
  bitmap_clear (wont_exit);

  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
				      npeel, wont_exit, NULL,
				      NULL, DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  free (wont_exit);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, npeel, false, false);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      if (desc->const_iter)
	{
	  desc->niter -= npeel;
	  desc->niter_expr = GEN_INT (desc->niter);
	  desc->noloop_assumptions = NULL_RTX;
	}
      else
	{
	  /* We cannot just update niter_expr, as its value might be clobbered
	     inside the loop.  We could handle this by counting the number into
	     a temporary just like we do in runtime unrolling, but it does not
	     seem worthwhile.  */
	  free_simple_loop_desc (loop);
	}
    }
  if (dump_file)
    fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
}
/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}
/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
	 for a loop to be really simple.  We could update the counts, but the
	 problem is that we are unable to decide which exit will be taken
	 (not really true in case the number of iterations is constant,
	 but no one will do anything with this information, so we do not
	 worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}
/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  ++*debug_uses;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}
/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}
/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we can not
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using, which includes the
	 initialization of the expansions with zero and the summation of
	 the expansions at the end of the computation, will yield wrong
	 results for (x = something - x); thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
/* Determines which of the insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}

/* Called just before loop duplication.  Records start of duplicated area
   to OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */
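
/* A worked example (illustrative only): when unrolling with N_COPIES == 3,
   the base variable is initialized in the original body (number 0), so the
   delta for copy N_COPY is simply N_COPY.  When peeling, the peeled copies
   are numbered 1 .. N_COPIES and the original body runs last as number 0;
   copy 1 therefore gets delta 0, copy 2 gets delta 1, and the original
   body gets delta N_COPIES.  */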

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
         body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
         number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
        return n_copy - 1;
      else
        return n_copies;
    }
}

/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}

/* Replace the use of induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
{
  rtx expr, *loc, seq, incr, var;
  enum machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
                                  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
                                  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
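
/* E.g. (illustrative, continuing the example at analyze_iv_to_split_insn):
   with step 1, the copy with DELTA == 2 has its whole SET_SRC replaced,
   yielding "i = i0 + 2", while the copy with DELTA == 0 becomes "i = i0"
   and is preceded by the base initialization "i0 = i + 1" emitted by
   insert_base_initialization.  */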

/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
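
/* For instance (illustrative): once the expansion limit is reached and two
   expansions E1 and E2 have been recorded, successive calls return VE->reg,
   E1, E2, VE->reg, E1, ... i.e. the accumulator and its expansions are
   handed out round-robin.  */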

/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
        ve->var_expansions.safe_push (new_reg);
        ve->expansion_count++;
      }
}
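
/* E.g. (illustrative): a fresh copy of "sum = sum + x" has both occurrences
   of SUM rewritten by the group replacement above, becoming
   "sum1 = sum1 + x" for the new register SUM1.  */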

/* Initialize the variable expansions in loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */
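
/* (A note on the arithmetic, for the record: under IEEE 754
   round-to-nearest, -0.0 + +0.0 yields +0.0, so a +0.0 initializer for
   SUM1 or SUM2 would silently flip a -0.0 result to +0.0.)  */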

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
                                     basic_block place)
{
  rtx seq, var, zero_init;
  unsigned i;
  enum machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
        {
          if (honor_signed_zero_p)
            zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
          else
            zero_init = CONST0_RTX (mode);
          emit_move_insn (var, zero_init);
        }
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
        {
          zero_init = CONST1_RTX (GET_MODE (var));
          emit_move_insn (var, zero_init);
        }
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, seq, var, insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
        sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
        sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
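
/* E.g. (illustrative): for a PLUS accumulator SUM with expansions SUM1 and
   SUM2, the loop above builds SUM = SUM2 + (SUM1 + SUM) and forces it into
   VE->reg right after the basic block note of the exit block.  */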

/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
        remove_note (insn, note);
        return;
      }
}

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body
   of the loop (as it should happen in complete unrolling, but not in
   ordinary peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
                     unsigned n_copies, bool unrolling,
                     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx insn, orig_insn, next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
         duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t) bb->aux, n_copies,
                                        unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
        {
          if (!INSN_P (insn)
              || (DEBUG_INSN_P (insn)
                  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
            continue;

          while (!INSN_P (orig_insn)
                 || (DEBUG_INSN_P (orig_insn)
                     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
                         == LABEL_DECL)))
            orig_insn = NEXT_INSN (orig_insn);

          ivts_templ.insn = orig_insn;
          ve_templ.insn = orig_insn;

          /* Apply splitting iv optimization.  */
          if (opt_info->insns_to_split)
            {
              maybe_strip_eq_note_for_split_iv (opt_info, insn);

              ivts = opt_info->insns_to_split->find (&ivts_templ);

              if (ivts)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));

                  if (!delta)
                    insert_base_initialization (ivts, insn);
                  split_iv (ivts, insn, delta);
                }
            }

          /* Apply variable expansion optimization.  */
          if (unrolling && opt_info->insns_with_var_to_expand)
            {
              ves = (struct var_to_expand *)
                opt_info->insns_with_var_to_expand->find (&ve_templ);
              if (ves)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));
                  expand_var_during_unrolling (ves, insn);
                }
            }

          orig_insn = NEXT_INSN (orig_insn);
        }
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find its blocks as the originals
     of the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
        continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
           orig_insn != NEXT_INSN (BB_END (bb));
           orig_insn = next)
        {
          next = NEXT_INSN (orig_insn);

          if (!INSN_P (orig_insn))
            continue;

          ivts_templ.insn = orig_insn;
          if (opt_info->insns_to_split)
            {
              maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

              ivts = (struct iv_to_split *)
                opt_info->insns_to_split->find (&ivts_templ);
              if (ivts)
                {
                  if (!delta)
                    insert_base_initialization (ivts, orig_insn);
                  split_iv (ivts, orig_insn, delta);
                  continue;
                }
            }
        }
    }
}

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}