gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2014 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "tree.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hash-table.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
  42    by fewer testings of exit conditions.  The main problem is code growth,
  43    that impacts performance negatively due to effect of caches.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       a win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
  64    There are many parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
  66
  67    ??? A great problem is that we don't have a good way to determine
  68    how many times we should unroll the loop; the experiments I have made
  69    showed that this choice may affect performance on the order of several %.
  70    */
  71
  72 /* Information about induction variables to split.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx_insn *insn;       /* The insn in that the induction variable occurs.  */
  77   rtx orig_var;         /* The variable (register) for the IV before split.  */
  78   rtx base_var;         /* The variable on that the values in the further
  79                            iterations are based.  */
  80   rtx step;             /* Step of the induction variable.  */
  81   struct iv_to_split *next; /* Next entry in walking order.  */
  82 };
  83
  84 /* Information about accumulators to expand.  */
  85
  86 struct var_to_expand
  87 {
  88   rtx_insn *insn;                  /* The insn in that the variable expansion occurs.  */
  89   rtx reg;                         /* The accumulator which is expanded.  */
  90   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  91   struct var_to_expand *next;      /* Next entry in walking order.  */
  92   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  93                                       or multiplication.  */
  94   int expansion_count;             /* Count the number of expansions generated so far.  */
  95   int reuse_expansion;             /* The expansion we intend to reuse to expand
  96                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
  97                                       the original accumulator.  Else use
  98                                       var_expansions[REUSE_EXPANSION - 1].  */
  99 };
 100
 101 /* Hashtable helper for iv_to_split.  */
 102
 103 struct iv_split_hasher : typed_free_remove <iv_to_split>
 104 {
 105   typedef iv_to_split value_type;
 106   typedef iv_to_split compare_type;
 107   static inline hashval_t hash (const value_type *);
 108   static inline bool equal (const value_type *, const compare_type *);
 109 };
 110
 111
 112 /* A hash function for information about insns to split.  */
 113
 114 inline hashval_t
 115 iv_split_hasher::hash (const value_type *ivts)
 116 {
 117   return (hashval_t) INSN_UID (ivts->insn);
 118 }
 119
 120 /* An equality functions for information about insns to split.  */
 121
 122 inline bool
 123 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
 124 {
 125   return i1->insn == i2->insn;
 126 }
 127
 128 /* Hashtable helper for iv_to_split.  */
 129
 130 struct var_expand_hasher : typed_free_remove <var_to_expand>
 131 {
 132   typedef var_to_expand value_type;
 133   typedef var_to_expand compare_type;
 134   static inline hashval_t hash (const value_type *);
 135   static inline bool equal (const value_type *, const compare_type *);
 136 };
 137
 138 /* Return a hash for VES.  */
 139
 140 inline hashval_t
 141 var_expand_hasher::hash (const value_type *ves)
 142 {
 143   return (hashval_t) INSN_UID (ves->insn);
 144 }
 145
 146 /* Return true if I1 and I2 refer to the same instruction.  */
 147
 148 inline bool
 149 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
 150 {
 151   return i1->insn == i2->insn;
 152 }
 153
 154 /* Information about optimization applied in
 155    the unrolled loop.  */
 156
 157 struct opt_info
 158 {
 159   hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
 160                                                   split.  */
 161   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 162   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 163   hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
 164                                         insns with accumulators to expand.  */
 165   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 166   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 167   unsigned first_new_block;        /* The first basic block that was
 168                                       duplicated.  */
 169   basic_block loop_exit;           /* The loop exit basic block.  */
 170   basic_block loop_preheader;      /* The loop preheader basic block.  */
 171 };
 172
 173 static void decide_unrolling_and_peeling (int);
 174 static void peel_loops_completely (int);
 175 static void decide_peel_simple (struct loop *, int);
 176 static void decide_peel_once_rolling (struct loop *, int);
 177 static void decide_peel_completely (struct loop *, int);
 178 static void decide_unroll_stupid (struct loop *, int);
 179 static void decide_unroll_constant_iterations (struct loop *, int);
 180 static void decide_unroll_runtime_iterations (struct loop *, int);
 181 static void peel_loop_simple (struct loop *);
 182 static void peel_loop_completely (struct loop *);
 183 static void unroll_loop_stupid (struct loop *);
 184 static void unroll_loop_constant_iterations (struct loop *);
 185 static void unroll_loop_runtime_iterations (struct loop *);
 186 static struct opt_info *analyze_insns_in_loop (struct loop *);
 187 static void opt_info_start_duplication (struct opt_info *);
 188 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 189 static void free_opt_info (struct opt_info *);
 190 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
 191 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 192 static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
 193 static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
 194 static void insert_var_expansion_initialization (struct var_to_expand *,
 195                                                  basic_block);
 196 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 197                                              basic_block);
 198 static rtx get_expansion (struct var_to_expand *);
 199
 200 /* Emit a message summarizing the unroll or peel that will be
 201    performed for LOOP, along with the loop's location LOCUS, if
 202    appropriate given the dump or -fopt-info settings.  */
 203
 204 static void
 205 report_unroll_peel (struct loop *loop, location_t locus)
 206 {
 207   struct niter_desc *desc;
 208   int niters = 0;
 209   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 210
 211   if (loop->lpt_decision.decision == LPT_NONE)
 212     return;
 213
 214   if (!dump_enabled_p ())
 215     return;
 216
 217   /* In the special case where the loop never iterated, emit
 218      a different message so that we don't report an unroll by 0.
 219      This matches the equivalent message emitted during tree unrolling.  */
 220   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 221       && !loop->lpt_decision.times)
 222     {
 223       dump_printf_loc (report_flags, locus,
 224                        "loop turned into non-loop; it never loops.\n");
 225       return;
 226     }
 227
 228   desc = get_simple_loop_desc (loop);
 229
 230   if (desc->const_iter)
 231     niters = desc->niter;
 232   else if (loop->header->count)
 233     niters = expected_loop_iterations (loop);
 234
 235   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 236     dump_printf_loc (report_flags, locus,
 237                      "loop with %d iterations completely unrolled",
 238                      loop->lpt_decision.times + 1);
 239   else
 240     dump_printf_loc (report_flags, locus,
 241                      "loop %s %d times",
 242                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 243                        ? "peeled" : "unrolled"),
 244                      loop->lpt_decision.times);
 245   if (profile_info)
 246     dump_printf (report_flags,
 247                  " (header execution count %d",
 248                  (int)loop->header->count);
 249   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 250     dump_printf (report_flags,
 251                  "%s%s iterations %d)",
 252                  profile_info ? ", " : " (",
 253                  desc->const_iter ? "const" : "average",
 254                  niters);
 255   else if (profile_info)
 256     dump_printf (report_flags, ")");
 257
 258   dump_printf (report_flags, "\n");
 259 }
 260
 261 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 262 void
 263 unroll_and_peel_loops (int flags)
 264 {
 265   struct loop *loop;
 266   bool changed = false;
 267
 268   /* First perform complete loop peeling (it is almost surely a win,
 269      and affects parameters for further decision a lot).  */
 270   peel_loops_completely (flags);
 271
 272   /* Now decide rest of unrolling and peeling.  */
 273   decide_unrolling_and_peeling (flags);
 274
 275   /* Scan the loops, inner ones first.  */
 276   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 277     {
 278       /* And perform the appropriate transformations.  */
 279       switch (loop->lpt_decision.decision)
 280         {
 281         case LPT_PEEL_COMPLETELY:
 282           /* Already done.  */
 283           gcc_unreachable ();
 284         case LPT_PEEL_SIMPLE:
 285           peel_loop_simple (loop);
 286           changed = true;
 287           break;
 288         case LPT_UNROLL_CONSTANT:
 289           unroll_loop_constant_iterations (loop);
 290           changed = true;
 291           break;
 292         case LPT_UNROLL_RUNTIME:
 293           unroll_loop_runtime_iterations (loop);
 294           changed = true;
 295           break;
 296         case LPT_UNROLL_STUPID:
 297           unroll_loop_stupid (loop);
 298           changed = true;
 299           break;
 300         case LPT_NONE:
 301           break;
 302         default:
 303           gcc_unreachable ();
 304         }
 305     }
 306
 307     if (changed)
 308       {
 309         calculate_dominance_info (CDI_DOMINATORS);
 310         fix_loop_structure (NULL);
 311       }
 312
 313   iv_analysis_done ();
 314 }
 315
 316 /* Check whether exit of the LOOP is at the end of loop body.  */
 317
 318 static bool
 319 loop_exit_at_end_p (struct loop *loop)
 320 {
 321   struct niter_desc *desc = get_simple_loop_desc (loop);
 322   rtx_insn *insn;
 323
 324   if (desc->in_edge->dest != loop->latch)
 325     return false;
 326
 327   /* Check that the latch is empty.  */
 328   FOR_BB_INSNS (loop->latch, insn)
 329     {
 330       if (NONDEBUG_INSN_P (insn))
 331         return false;
 332     }
 333
 334   return true;
 335 }
 336
 337 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 338 static void
 339 peel_loops_completely (int flags)
 340 {
 341   struct loop *loop;
 342   bool changed = false;
 343
 344   /* Scan the loops, the inner ones first.  */
 345   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 346     {
 347       loop->lpt_decision.decision = LPT_NONE;
 348       location_t locus = get_loop_location (loop);
 349
 350       if (dump_enabled_p ())
 351         dump_printf_loc (TDF_RTL, locus,
 352                          ";; *** Considering loop %d at BB %d for "
 353                          "complete peeling ***\n",
 354                          loop->num, loop->header->index);
 355
 356       loop->ninsns = num_loop_insns (loop);
 357
 358       decide_peel_once_rolling (loop, flags);
 359       if (loop->lpt_decision.decision == LPT_NONE)
 360         decide_peel_completely (loop, flags);
 361
 362       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 363         {
 364           report_unroll_peel (loop, locus);
 365           peel_loop_completely (loop);
 366           changed = true;
 367         }
 368     }
 369
 370     if (changed)
 371       {
 372         calculate_dominance_info (CDI_DOMINATORS);
 373         fix_loop_structure (NULL);
 374       }
 375 }
 376
 377 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 378 static void
 379 decide_unrolling_and_peeling (int flags)
 380 {
 381   struct loop *loop;
 382
 383   /* Scan the loops, inner ones first.  */
 384   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 385     {
 386       loop->lpt_decision.decision = LPT_NONE;
 387       location_t locus = get_loop_location (loop);
 388
 389       if (dump_enabled_p ())
 390         dump_printf_loc (TDF_RTL, locus,
 391                          ";; *** Considering loop %d at BB %d for "
 392                          "unrolling and peeling ***\n",
 393                          loop->num, loop->header->index);
 394
 395       /* Do not peel cold areas.  */
 396       if (optimize_loop_for_size_p (loop))
 397         {
 398           if (dump_file)
 399             fprintf (dump_file, ";; Not considering loop, cold area\n");
 400           continue;
 401         }
 402
 403       /* Can the loop be manipulated?  */
 404       if (!can_duplicate_loop_p (loop))
 405         {
 406           if (dump_file)
 407             fprintf (dump_file,
 408                      ";; Not considering loop, cannot duplicate\n");
 409           continue;
 410         }
 411
 412       /* Skip non-innermost loops.  */
 413       if (loop->inner)
 414         {
 415           if (dump_file)
 416             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 417           continue;
 418         }
 419
 420       loop->ninsns = num_loop_insns (loop);
 421       loop->av_ninsns = average_num_loop_insns (loop);
 422
 423       /* Try transformations one by one in decreasing order of
 424          priority.  */
 425
 426       decide_unroll_constant_iterations (loop, flags);
 427       if (loop->lpt_decision.decision == LPT_NONE)
 428         decide_unroll_runtime_iterations (loop, flags);
 429       if (loop->lpt_decision.decision == LPT_NONE)
 430         decide_unroll_stupid (loop, flags);
 431       if (loop->lpt_decision.decision == LPT_NONE)
 432         decide_peel_simple (loop, flags);
 433
 434       report_unroll_peel (loop, locus);
 435     }
 436 }
 437
 438 /* Decide whether the LOOP is once rolling and suitable for complete
 439    peeling.  */
 440 static void
 441 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 442 {
 443   struct niter_desc *desc;
 444
 445   if (dump_file)
 446     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 447
 448   /* Is the loop small enough?  */
 449   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 450     {
 451       if (dump_file)
 452         fprintf (dump_file, ";; Not considering loop, is too big\n");
 453       return;
 454     }
 455
 456   /* Check for simple loops.  */
 457   desc = get_simple_loop_desc (loop);
 458
 459   /* Check number of iterations.  */
 460   if (!desc->simple_p
 461       || desc->assumptions
 462       || desc->infinite
 463       || !desc->const_iter
 464       || (desc->niter != 0
 465           && get_max_loop_iterations_int (loop) != 0))
 466     {
 467       if (dump_file)
 468         fprintf (dump_file,
 469                  ";; Unable to prove that the loop rolls exactly once\n");
 470       return;
 471     }
 472
 473   /* Success.  */
 474   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 475 }
 476
 477 /* Decide whether the LOOP is suitable for complete peeling.  */
 478 static void
 479 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 480 {
 481   unsigned npeel;
 482   struct niter_desc *desc;
 483
 484   if (dump_file)
 485     fprintf (dump_file, "\n;; Considering peeling completely\n");
 486
 487   /* Skip non-innermost loops.  */
 488   if (loop->inner)
 489     {
 490       if (dump_file)
 491         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 492       return;
 493     }
 494
 495   /* Do not peel cold areas.  */
 496   if (optimize_loop_for_size_p (loop))
 497     {
 498       if (dump_file)
 499         fprintf (dump_file, ";; Not considering loop, cold area\n");
 500       return;
 501     }
 502
 503   /* Can the loop be manipulated?  */
 504   if (!can_duplicate_loop_p (loop))
 505     {
 506       if (dump_file)
 507         fprintf (dump_file,
 508                  ";; Not considering loop, cannot duplicate\n");
 509       return;
 510     }
 511
 512   /* npeel = number of iterations to peel.  */
 513   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 514   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 515     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 516
 517   /* Is the loop small enough?  */
 518   if (!npeel)
 519     {
 520       if (dump_file)
 521         fprintf (dump_file, ";; Not considering loop, is too big\n");
 522       return;
 523     }
 524
 525   /* Check for simple loops.  */
 526   desc = get_simple_loop_desc (loop);
 527
 528   /* Check number of iterations.  */
 529   if (!desc->simple_p
 530       || desc->assumptions
 531       || !desc->const_iter
 532       || desc->infinite)
 533     {
 534       if (dump_file)
 535         fprintf (dump_file,
 536                  ";; Unable to prove that the loop iterates constant times\n");
 537       return;
 538     }
 539
 540   if (desc->niter > npeel - 1)
 541     {
 542       if (dump_file)
 543         {
 544           fprintf (dump_file,
 545                    ";; Not peeling loop completely, rolls too much (");
 546           fprintf (dump_file, "%"PRId64, desc->niter);
 547           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 548         }
 549       return;
 550     }
 551
 552   /* Success.  */
 553   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 554 }
 555
 556 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 557    completely.  The transformation done:
 558
 559    for (i = 0; i < 4; i++)
 560      body;
 561
 562    ==>
 563
 564    i = 0;
 565    body; i++;
 566    body; i++;
 567    body; i++;
 568    body; i++;
 569    */
 570 static void
 571 peel_loop_completely (struct loop *loop)
 572 {
 573   sbitmap wont_exit;
 574   unsigned HOST_WIDE_INT npeel;
 575   unsigned i;
 576   edge ein;
 577   struct niter_desc *desc = get_simple_loop_desc (loop);
 578   struct opt_info *opt_info = NULL;
 579
 580   npeel = desc->niter;
 581
 582   if (npeel)
 583     {
 584       bool ok;
 585
 586       wont_exit = sbitmap_alloc (npeel + 1);
 587       bitmap_ones (wont_exit);
 588       bitmap_clear_bit (wont_exit, 0);
 589       if (desc->noloop_assumptions)
 590         bitmap_clear_bit (wont_exit, 1);
 591
 592       auto_vec<edge> remove_edges;
 593       if (flag_split_ivs_in_unroller)
 594         opt_info = analyze_insns_in_loop (loop);
 595
 596       opt_info_start_duplication (opt_info);
 597       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 598                                           npeel,
 599                                           wont_exit, desc->out_edge,
 600                                           &remove_edges,
 601                                           DLTHE_FLAG_UPDATE_FREQ
 602                                           | DLTHE_FLAG_COMPLETTE_PEEL
 603                                           | (opt_info
 604                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 605       gcc_assert (ok);
 606
 607       free (wont_exit);
 608
 609       if (opt_info)
 610         {
 611           apply_opt_in_copies (opt_info, npeel, false, true);
 612           free_opt_info (opt_info);
 613         }
 614
 615       /* Remove the exit edges.  */
 616       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 617         remove_path (ein);
 618     }
 619
 620   ein = desc->in_edge;
 621   free_simple_loop_desc (loop);
 622
 623   /* Now remove the unreachable part of the last iteration and cancel
 624      the loop.  */
 625   remove_path (ein);
 626
 627   if (dump_file)
 628     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 629 }
 630
 631 /* Decide whether to unroll LOOP iterating constant number of times
 632    and how much.  */
 633
 634 static void
 635 decide_unroll_constant_iterations (struct loop *loop, int flags)
 636 {
 637   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 638   struct niter_desc *desc;
 639   widest_int iterations;
 640
 641   if (!(flags & UAP_UNROLL))
 642     {
 643       /* We were not asked to, just return back silently.  */
 644       return;
 645     }
 646
 647   if (dump_file)
 648     fprintf (dump_file,
 649              "\n;; Considering unrolling loop with constant "
 650              "number of iterations\n");
 651
 652   /* nunroll = total number of copies of the original loop body in
 653      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 654   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 655   nunroll_by_av
 656     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 657   if (nunroll > nunroll_by_av)
 658     nunroll = nunroll_by_av;
 659   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 660     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 661
 662   if (targetm.loop_unroll_adjust)
 663     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 664
 665   /* Skip big loops.  */
 666   if (nunroll <= 1)
 667     {
 668       if (dump_file)
 669         fprintf (dump_file, ";; Not considering loop, is too big\n");
 670       return;
 671     }
 672
 673   /* Check for simple loops.  */
 674   desc = get_simple_loop_desc (loop);
 675
 676   /* Check number of iterations.  */
 677   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 678     {
 679       if (dump_file)
 680         fprintf (dump_file,
 681                  ";; Unable to prove that the loop iterates constant times\n");
 682       return;
 683     }
 684
 685   /* Check whether the loop rolls enough to consider.
 686      Consult also loop bounds and profile; in the case the loop has more
 687      than one exit it may well loop less than determined maximal number
 688      of iterations.  */
 689   if (desc->niter < 2 * nunroll
 690       || ((get_estimated_loop_iterations (loop, &iterations)
 691            || get_max_loop_iterations (loop, &iterations))
 692           && wi::ltu_p (iterations, 2 * nunroll)))
 693     {
 694       if (dump_file)
 695         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 696       return;
 697     }
 698
 699   /* Success; now compute number of iterations to unroll.  We alter
 700      nunroll so that as few as possible copies of loop body are
 701      necessary, while still not decreasing the number of unrollings
 702      too much (at most by 1).  */
 703   best_copies = 2 * nunroll + 10;
 704
 705   i = 2 * nunroll + 2;
 706   if (i - 1 >= desc->niter)
 707     i = desc->niter - 2;
 708
 709   for (; i >= nunroll - 1; i--)
 710     {
 711       unsigned exit_mod = desc->niter % (i + 1);
 712
 713       if (!loop_exit_at_end_p (loop))
 714         n_copies = exit_mod + i + 1;
 715       else if (exit_mod != (unsigned) i
 716                || desc->noloop_assumptions != NULL_RTX)
 717         n_copies = exit_mod + i + 2;
 718       else
 719         n_copies = i + 1;
 720
 721       if (n_copies < best_copies)
 722         {
 723           best_copies = n_copies;
 724           best_unroll = i;
 725         }
 726     }
 727
 728   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 729   loop->lpt_decision.times = best_unroll;
 730 }
 731
 732 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 733    The transformation does this:
 734
 735    for (i = 0; i < 102; i++)
 736      body;
 737
 738    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 739
 740    i = 0;
 741    body; i++;
 742    body; i++;
 743    while (i < 102)
 744      {
 745        body; i++;
 746        body; i++;
 747        body; i++;
 748        body; i++;
 749      }
 750   */
 751 static void
 752 unroll_loop_constant_iterations (struct loop *loop)
 753 {
 754   unsigned HOST_WIDE_INT niter;
 755   unsigned exit_mod;
 756   sbitmap wont_exit;
 757   unsigned i;
 758   edge e;
 759   unsigned max_unroll = loop->lpt_decision.times;
 760   struct niter_desc *desc = get_simple_loop_desc (loop);
 761   bool exit_at_end = loop_exit_at_end_p (loop);
 762   struct opt_info *opt_info = NULL;
 763   bool ok;
 764
 765   niter = desc->niter;
 766
 767   /* Should not get here (such loop should be peeled instead).  */
 768   gcc_assert (niter > max_unroll + 1);
 769
 770   exit_mod = niter % (max_unroll + 1);
 771
 772   wont_exit = sbitmap_alloc (max_unroll + 1);
 773   bitmap_ones (wont_exit);
 774
 775   auto_vec<edge> remove_edges;
 776   if (flag_split_ivs_in_unroller
 777       || flag_variable_expansion_in_unroller)
 778     opt_info = analyze_insns_in_loop (loop);
 779
 780   if (!exit_at_end)
 781     {
 782       /* The exit is not at the end of the loop; leave exit test
 783          in the first copy, so that the loops that start with test
 784          of exit condition have continuous body after unrolling.  */
 785
 786       if (dump_file)
 787         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 788
 789       /* Peel exit_mod iterations.  */
 790       bitmap_clear_bit (wont_exit, 0);
 791       if (desc->noloop_assumptions)
 792         bitmap_clear_bit (wont_exit, 1);
 793
 794       if (exit_mod)
 795         {
 796           opt_info_start_duplication (opt_info);
 797           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 798                                               exit_mod,
 799                                               wont_exit, desc->out_edge,
 800                                               &remove_edges,
 801                                               DLTHE_FLAG_UPDATE_FREQ
 802                                               | (opt_info && exit_mod > 1
 803                                                  ? DLTHE_RECORD_COPY_NUMBER
 804                                                    : 0));
 805           gcc_assert (ok);
 806
 807           if (opt_info && exit_mod > 1)
 808             apply_opt_in_copies (opt_info, exit_mod, false, false);
 809
 810           desc->noloop_assumptions = NULL_RTX;
 811           desc->niter -= exit_mod;
 812           loop->nb_iterations_upper_bound -= exit_mod;
 813           if (loop->any_estimate
 814               && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
 815             loop->nb_iterations_estimate -= exit_mod;
 816           else
 817             loop->any_estimate = false;
 818         }
 819
 820       bitmap_set_bit (wont_exit, 1);
 821     }
 822   else
 823     {
 824       /* Leave exit test in last copy, for the same reason as above if
 825          the loop tests the condition at the end of loop body.  */
 826
 827       if (dump_file)
 828         fprintf (dump_file, ";; Condition at end of loop.\n");
 829
 830       /* We know that niter >= max_unroll + 2; so we do not need to care of
 831          case when we would exit before reaching the loop.  So just peel
 832          exit_mod + 1 iterations.  */
 833       if (exit_mod != max_unroll
 834           || desc->noloop_assumptions)
 835         {
 836           bitmap_clear_bit (wont_exit, 0);
 837           if (desc->noloop_assumptions)
 838             bitmap_clear_bit (wont_exit, 1);
 839
 840           opt_info_start_duplication (opt_info);
 841           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 842                                               exit_mod + 1,
 843                                               wont_exit, desc->out_edge,
 844                                               &remove_edges,
 845                                               DLTHE_FLAG_UPDATE_FREQ
 846                                               | (opt_info && exit_mod > 0
 847                                                  ? DLTHE_RECORD_COPY_NUMBER
 848                                                    : 0));
 849           gcc_assert (ok);
 850
 851           if (opt_info && exit_mod > 0)
 852             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 853
 854           desc->niter -= exit_mod + 1;
 855           loop->nb_iterations_upper_bound -= exit_mod + 1;
 856           if (loop->any_estimate
 857               && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
 858             loop->nb_iterations_estimate -= exit_mod + 1;
 859           else
 860             loop->any_estimate = false;
 861           desc->noloop_assumptions = NULL_RTX;
 862
 863           bitmap_set_bit (wont_exit, 0);
 864           bitmap_set_bit (wont_exit, 1);
 865         }
 866
 867       bitmap_clear_bit (wont_exit, max_unroll);
 868     }
 869
 870   /* Now unroll the loop.  */
 871
 872   opt_info_start_duplication (opt_info);
 873   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 874                                       max_unroll,
 875                                       wont_exit, desc->out_edge,
 876                                       &remove_edges,
 877                                       DLTHE_FLAG_UPDATE_FREQ
 878                                       | (opt_info
 879                                          ? DLTHE_RECORD_COPY_NUMBER
 880                                            : 0));
 881   gcc_assert (ok);
 882
 883   if (opt_info)
 884     {
 885       apply_opt_in_copies (opt_info, max_unroll, true, true);
 886       free_opt_info (opt_info);
 887     }
 888
 889   free (wont_exit);
 890
 891   if (exit_at_end)
 892     {
 893       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 894       /* Find a new in and out edge; they are in the last copy we have made.  */
 895
 896       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 897         {
 898           desc->out_edge = EDGE_SUCC (exit_block, 0);
 899           desc->in_edge = EDGE_SUCC (exit_block, 1);
 900         }
 901       else
 902         {
 903           desc->out_edge = EDGE_SUCC (exit_block, 1);
 904           desc->in_edge = EDGE_SUCC (exit_block, 0);
 905         }
 906     }
 907
 908   desc->niter /= max_unroll + 1;
 909   loop->nb_iterations_upper_bound
 910     = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
 911   if (loop->any_estimate)
 912     loop->nb_iterations_estimate
 913       = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
 914   desc->niter_expr = GEN_INT (desc->niter);
 915
 916   /* Remove the edges.  */
 917   FOR_EACH_VEC_ELT (remove_edges, i, e)
 918     remove_path (e);
 919
 920   if (dump_file)
 921     fprintf (dump_file,
 922              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 923              max_unroll, num_loop_insns (loop));
 924 }
 925
 926 /* Decide whether to unroll LOOP iterating runtime computable number of times
 927    and how much.  */
 928 static void
 929 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 930 {
 931   unsigned nunroll, nunroll_by_av, i;
 932   struct niter_desc *desc;
 933   widest_int iterations;
 934
 935   if (!(flags & UAP_UNROLL))
 936     {
 937       /* We were not asked to, just return back silently.  */
 938       return;
 939     }
 940
 941   if (dump_file)
 942     fprintf (dump_file,
 943              "\n;; Considering unrolling loop with runtime "
 944              "computable number of iterations\n");
 945
 946   /* nunroll = total number of copies of the original loop body in
 947      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 948   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 949   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 950   if (nunroll > nunroll_by_av)
 951     nunroll = nunroll_by_av;
 952   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 953     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 954
 955   if (targetm.loop_unroll_adjust)
 956     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 957
 958   /* Skip big loops.  */
 959   if (nunroll <= 1)
 960     {
 961       if (dump_file)
 962         fprintf (dump_file, ";; Not considering loop, is too big\n");
 963       return;
 964     }
 965
 966   /* Check for simple loops.  */
 967   desc = get_simple_loop_desc (loop);
 968
 969   /* Check simpleness.  */
 970   if (!desc->simple_p || desc->assumptions)
 971     {
 972       if (dump_file)
 973         fprintf (dump_file,
 974                  ";; Unable to prove that the number of iterations "
 975                  "can be counted in runtime\n");
 976       return;
 977     }
 978
 979   if (desc->const_iter)
 980     {
 981       if (dump_file)
 982         fprintf (dump_file, ";; Loop iterates constant times\n");
 983       return;
 984     }
 985
 986   /* Check whether the loop rolls.  */
 987   if ((get_estimated_loop_iterations (loop, &iterations)
 988        || get_max_loop_iterations (loop, &iterations))
 989       && wi::ltu_p (iterations, 2 * nunroll))
 990     {
 991       if (dump_file)
 992         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 993       return;
 994     }
 995
 996   /* Success; now force nunroll to be power of 2, as we are unable to
 997      cope with overflows in computation of number of iterations.  */
 998   for (i = 1; 2 * i <= nunroll; i *= 2)
 999     continue;
1000
1001   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1002   loop->lpt_decision.times = i - 1;
1003 }
1004
1005 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1006    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1007    and NULL is returned instead.  */
1008
1009 basic_block
1010 split_edge_and_insert (edge e, rtx_insn *insns)
1011 {
1012   basic_block bb;
1013
1014   if (!insns)
1015     return NULL;
1016   bb = split_edge (e);
1017   emit_insn_after (insns, BB_END (bb));
1018
1019   /* ??? We used to assume that INSNS can contain control flow insns, and
1020      that we had to try to find sub basic blocks in BB to maintain a valid
1021      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1022      and call break_superblocks when going out of cfglayout mode.  But it
1023      turns out that this never happens; and that if it does ever happen,
1024      the verify_flow_info at the end of the RTL loop passes would fail.
1025
1026      There are two reasons why we expected we could have control flow insns
1027      in INSNS.  The first is when a comparison has to be done in parts, and
1028      the second is when the number of iterations is computed for loops with
1029      the number of iterations known at runtime.  In both cases, test cases
1030      to get control flow in INSNS appear to be impossible to construct:
1031
1032       * If do_compare_rtx_and_jump needs several branches to do comparison
1033         in a mode that needs comparison by parts, we cannot analyze the
1034         number of iterations of the loop, and we never get to unrolling it.
1035
1036       * The code in expand_divmod that was suspected to cause creation of
1037         branching code seems to be only accessed for signed division.  The
1038         divisions used by # of iterations analysis are always unsigned.
1039         Problems might arise on architectures that emits branching code
1040         for some operations that may appear in the unroller (especially
1041         for division), but we have no such architectures.
1042
1043      Considering all this, it was decided that we should for now assume
1044      that INSNS can in theory contain control flow insns, but in practice
1045      it never does.  So we don't handle the theoretical case, and should
1046      a real failure ever show up, we have a pretty good clue for how to
1047      fix it.  */
1048
1049   return bb;
1050 }
1051
1052 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
1053    true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
1054    in order to create a jump.  */
1055
1056 static rtx_insn *
1057 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
1058                       rtx_insn *cinsn)
1059 {
1060   rtx_insn *seq, *jump;
1061   rtx cond;
1062   enum machine_mode mode;
1063
1064   mode = GET_MODE (op0);
1065   if (mode == VOIDmode)
1066     mode = GET_MODE (op1);
1067
1068   start_sequence ();
1069   if (GET_MODE_CLASS (mode) == MODE_CC)
1070     {
1071       /* A hack -- there seems to be no easy generic way how to make a
1072          conditional jump from a ccmode comparison.  */
1073       gcc_assert (cinsn);
1074       cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
1075       gcc_assert (GET_CODE (cond) == comp);
1076       gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
1077       gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
1078       emit_jump_insn (copy_insn (PATTERN (cinsn)));
1079       jump = get_last_insn ();
1080       gcc_assert (JUMP_P (jump));
1081       JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
1082       LABEL_NUSES (JUMP_LABEL (jump))++;
1083       redirect_jump (jump, label, 0);
1084     }
1085   else
1086     {
1087       gcc_assert (!cinsn);
1088
1089       op0 = force_operand (op0, NULL_RTX);
1090       op1 = force_operand (op1, NULL_RTX);
1091       do_compare_rtx_and_jump (op0, op1, comp, 0,
1092                                mode, NULL_RTX, NULL_RTX, label, -1);
1093       jump = get_last_insn ();
1094       gcc_assert (JUMP_P (jump));
1095       JUMP_LABEL (jump) = label;
1096       LABEL_NUSES (label)++;
1097     }
1098   add_int_reg_note (jump, REG_BR_PROB, prob);
1099
1100   seq = get_insns ();
1101   end_sequence ();
1102
1103   return seq;
1104 }
1105
/* Unroll LOOP for which we are able to count number of iterations in runtime
   LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
   extra care for case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }

   The remainder is obtained by ANDing the number of iterations with
   LOOP->LPT_DECISION.TIMES, so TIMES + 1 is expected to be a power of two.
   Before returning, the niter description of LOOP and the recorded upper
   bound and estimate of its number of iterations are updated to describe
   the unrolled loop.  */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  /* Analyze the insns of the loop only when one of the optimizations
     applied to the copies below is enabled.  */
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  These are
     the blocks outside LOOP that get_dominated_by reports for some block
     of LOOP.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
        if (!flow_bb_inside_loop_p (loop, bb))
          dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  /* Choose the shape of the preconditioning code:
       MAY_EXIT_COPY     copy whose bit is cleared in WONT_EXIT for the
                         final unrolling,
       N_PEEL            number of copies peeled by the switch-building
                         loop below (one more copy is peeled before it),
       EXTRA_ZERO_CHECK  whether a separate branch for a zero remainder
                         is emitted,
       LAST_MAY_EXIT     whether the last peeled copy is left without its
                         WONT_EXIT bit set.  */
  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for number of iterations.  OLD_NITER keeps the register
     holding the full count; it is used at the end to build the new
     niter_expr.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  From here on NITER
     is the remainder, not the full count.  */
  niter = expand_simple_binop (desc->mode, AND,
                               niter, gen_int_mode (max_unroll, desc->mode),
                               NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  /* Edges collected by the duplications below; they are removed at the
     end of this function.  */
  auto_vec<edge> remove_edges;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of loop body (almost always we must leave exit test
     here; the only exception is when we have extra zero check and the number
     of iterations is reliable).  Also record the place of (possible) extra
     zero check.  */
  bitmap_clear (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    bitmap_set_bit (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      1, wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
        bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          1, wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  J is the value of the remainder that
         branches to the preheader created below, and P is the probability
         given to that branch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
                                          block_label (preheader), p,
                                          NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
         and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      /* Put the test into a new block in front of the switch built so far
         and add the edge taken when the test succeeds.  */
      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      /* NOTE(review): the zero-check code below updates single_succ_edge
         at the corresponding place; confirm that the predecessor edge is
         the one meant here.  */
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      /* NOTE(review): this divides by P rather than scaling the count
         down by it; confirm this is the intended profile update.  */
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  The test is inserted after the
         block recorded in EZC_SWTCH before the first copy was peeled.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
                                          block_label (preheader), p,
                                          NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      /* NOTE(review): same count computation as in the loop above.  */
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  Every bit of WONT_EXIT is set except the one of
     MAY_EXIT_COPY.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                           : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
         made.  The successor leading to the old exit destination becomes
         the out edge, the other one the in edge.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this: the old count divided
     (unsigned) by MAX_UNROLL + 1, minus one when the exit is at the end
     of the loop.  The recorded upper bound and estimate are adjusted the
     same way.  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
                         gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
        simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      /* An estimate of zero cannot be decremented; drop it instead.  */
      if (loop->any_estimate
          && loop->nb_iterations_estimate != 0)
        --loop->nb_iterations_estimate;
      else
        loop->any_estimate = false;
    }

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, counting # of iterations "
             "in runtime, %i insns\n",
             max_unroll, num_loop_insns (loop));
}
1374
1375 /* Decide whether to simply peel LOOP and how much.  */
1376 static void
1377 decide_peel_simple (struct loop *loop, int flags)
1378 {
1379   unsigned npeel;
1380   widest_int iterations;
1381
1382   if (!(flags & UAP_PEEL))
1383     {
1384       /* We were not asked to, just return back silently.  */
1385       return;
1386     }
1387
1388   if (dump_file)
1389     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1390
1391   /* npeel = number of iterations to peel.  */
1392   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1393   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1394     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1395
1396   /* Skip big loops.  */
1397   if (!npeel)
1398     {
1399       if (dump_file)
1400         fprintf (dump_file, ";; Not considering loop, is too big\n");
1401       return;
1402     }
1403
1404   /* Do not simply peel loops with branches inside -- it increases number
1405      of mispredicts.
1406      Exception is when we do have profile and we however have good chance
1407      to peel proper number of iterations loop will iterate in practice.
1408      TODO: this heuristic needs tunning; while for complette unrolling
1409      the branch inside loop mostly eliminates any improvements, for
1410      peeling it is not the case.  Also a function call inside loop is
1411      also branch from branch prediction POV (and probably better reason
1412      to not unroll/peel).  */
1413   if (num_loop_branches (loop) > 1
1414       && profile_status_for_fn (cfun) != PROFILE_READ)
1415     {
1416       if (dump_file)
1417         fprintf (dump_file, ";; Not peeling, contains branches\n");
1418       return;
1419     }
1420
1421   /* If we have realistic estimate on number of iterations, use it.  */
1422   if (get_estimated_loop_iterations (loop, &iterations))
1423     {
1424       if (wi::leu_p (npeel, iterations))
1425         {
1426           if (dump_file)
1427             {
1428               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1429               fprintf (dump_file, "%"PRId64,
1430                        (int64_t) (iterations.to_shwi () + 1));
1431               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1432                        npeel);
1433             }
1434           return;
1435         }
1436       npeel = iterations.to_shwi () + 1;
1437     }
1438   /* If we have small enough bound on iterations, we can still peel (completely
1439      unroll).  */
1440   else if (get_max_loop_iterations (loop, &iterations)
1441            && wi::ltu_p (iterations, npeel))
1442     npeel = iterations.to_shwi () + 1;
1443   else
1444     {
1445       /* For now we have no good heuristics to decide whether loop peeling
1446          will be effective, so disable it.  */
1447       if (dump_file)
1448         fprintf (dump_file,
1449                  ";; Not peeling loop, no evidence it will be profitable\n");
1450       return;
1451     }
1452
1453   /* Success.  */
1454   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1455   loop->lpt_decision.times = npeel;
1456 }
1457
1458 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1459
1460    while (cond)
1461      body;
1462
1463    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1464
1465    if (!cond) goto end;
1466    body;
1467    if (!cond) goto end;
1468    body;
1469    if (!cond) goto end;
1470    body;
1471    while (cond)
1472      body;
1473    end: ;
1474    */
1475 static void
1476 peel_loop_simple (struct loop *loop)
1477 {
1478   sbitmap wont_exit;
1479   unsigned npeel = loop->lpt_decision.times;
1480   struct niter_desc *desc = get_simple_loop_desc (loop);
1481   struct opt_info *opt_info = NULL;
1482   bool ok;
1483
1484   if (flag_split_ivs_in_unroller && npeel > 1)
1485     opt_info = analyze_insns_in_loop (loop);
1486
1487   wont_exit = sbitmap_alloc (npeel + 1);
1488   bitmap_clear (wont_exit);
1489
1490   opt_info_start_duplication (opt_info);
1491
1492   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1493                                       npeel, wont_exit, NULL,
1494                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1495                                       | (opt_info
1496                                          ? DLTHE_RECORD_COPY_NUMBER
1497                                            : 0));
1498   gcc_assert (ok);
1499
1500   free (wont_exit);
1501
1502   if (opt_info)
1503     {
1504       apply_opt_in_copies (opt_info, npeel, false, false);
1505       free_opt_info (opt_info);
1506     }
1507
1508   if (desc->simple_p)
1509     {
1510       if (desc->const_iter)
1511         {
1512           desc->niter -= npeel;
1513           desc->niter_expr = GEN_INT (desc->niter);
1514           desc->noloop_assumptions = NULL_RTX;
1515         }
1516       else
1517         {
1518           /* We cannot just update niter_expr, as its value might be clobbered
1519              inside loop.  We could handle this by counting the number into
1520              temporary just like we do in runtime unrolling, but it does not
1521              seem worthwhile.  */
1522           free_simple_loop_desc (loop);
1523         }
1524     }
1525   if (dump_file)
1526     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1527 }
1528
1529 /* Decide whether to unroll LOOP stupidly and how much.  */
1530 static void
1531 decide_unroll_stupid (struct loop *loop, int flags)
1532 {
1533   unsigned nunroll, nunroll_by_av, i;
1534   struct niter_desc *desc;
1535   widest_int iterations;
1536
1537   if (!(flags & UAP_UNROLL_ALL))
1538     {
1539       /* We were not asked to, just return back silently.  */
1540       return;
1541     }
1542
1543   if (dump_file)
1544     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1545
1546   /* nunroll = total number of copies of the original loop body in
1547      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1548   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1549   nunroll_by_av
1550     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1551   if (nunroll > nunroll_by_av)
1552     nunroll = nunroll_by_av;
1553   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1554     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1555
1556   if (targetm.loop_unroll_adjust)
1557     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1558
1559   /* Skip big loops.  */
1560   if (nunroll <= 1)
1561     {
1562       if (dump_file)
1563         fprintf (dump_file, ";; Not considering loop, is too big\n");
1564       return;
1565     }
1566
1567   /* Check for simple loops.  */
1568   desc = get_simple_loop_desc (loop);
1569
1570   /* Check simpleness.  */
1571   if (desc->simple_p && !desc->assumptions)
1572     {
1573       if (dump_file)
1574         fprintf (dump_file, ";; The loop is simple\n");
1575       return;
1576     }
1577
1578   /* Do not unroll loops with branches inside -- it increases number
1579      of mispredicts.
1580      TODO: this heuristic needs tunning; call inside the loop body
1581      is also relatively good reason to not unroll.  */
1582   if (num_loop_branches (loop) > 1)
1583     {
1584       if (dump_file)
1585         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1586       return;
1587     }
1588
1589   /* Check whether the loop rolls.  */
1590   if ((get_estimated_loop_iterations (loop, &iterations)
1591        || get_max_loop_iterations (loop, &iterations))
1592       && wi::ltu_p (iterations, 2 * nunroll))
1593     {
1594       if (dump_file)
1595         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1596       return;
1597     }
1598
1599   /* Success.  Now force nunroll to be power of 2, as it seems that this
1600      improves results (partially because of better alignments, partially
1601      because of some dark magic).  */
1602   for (i = 1; 2 * i <= nunroll; i *= 2)
1603     continue;
1604
1605   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1606   loop->lpt_decision.times = i - 1;
1607 }
1608
1609 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1610
1611    while (cond)
1612      body;
1613
1614    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1615
1616    while (cond)
1617      {
1618        body;
1619        if (!cond) break;
1620        body;
1621        if (!cond) break;
1622        body;
1623        if (!cond) break;
1624        body;
1625      }
1626    */
1627 static void
1628 unroll_loop_stupid (struct loop *loop)
1629 {
1630   sbitmap wont_exit;
1631   unsigned nunroll = loop->lpt_decision.times;
1632   struct niter_desc *desc = get_simple_loop_desc (loop);
1633   struct opt_info *opt_info = NULL;
1634   bool ok;
1635
1636   if (flag_split_ivs_in_unroller
1637       || flag_variable_expansion_in_unroller)
1638     opt_info = analyze_insns_in_loop (loop);
1639
1640
1641   wont_exit = sbitmap_alloc (nunroll + 1);
1642   bitmap_clear (wont_exit);
1643   opt_info_start_duplication (opt_info);
1644
1645   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1646                                       nunroll, wont_exit,
1647                                       NULL, NULL,
1648                                       DLTHE_FLAG_UPDATE_FREQ
1649                                       | (opt_info
1650                                          ? DLTHE_RECORD_COPY_NUMBER
1651                                            : 0));
1652   gcc_assert (ok);
1653
1654   if (opt_info)
1655     {
1656       apply_opt_in_copies (opt_info, nunroll, true, true);
1657       free_opt_info (opt_info);
1658     }
1659
1660   free (wont_exit);
1661
1662   if (desc->simple_p)
1663     {
1664       /* We indeed may get here provided that there are nontrivial assumptions
1665          for a loop to be really simple.  We could update the counts, but the
1666          problem is that we are unable to decide which exit will be taken
1667          (not really true in case the number of iterations is constant,
1668          but no one will do anything with this information, so we do not
1669          worry about it).  */
1670       desc->simple_p = false;
1671     }
1672
1673   if (dump_file)
1674     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1675              nunroll, num_loop_insns (loop));
1676 }
1677
1678 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1679    Set *DEBUG_USES to the number of debug insns that reference the
1680    variable.  */
1681
1682 bool
1683 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1684                                   int *debug_uses)
1685 {
1686   basic_block *body, bb;
1687   unsigned i;
1688   int count_ref = 0;
1689   rtx_insn *insn;
1690
1691   body = get_loop_body (loop);
1692   for (i = 0; i < loop->num_nodes; i++)
1693     {
1694       bb = body[i];
1695
1696       FOR_BB_INSNS (bb, insn)
1697         if (!rtx_referenced_p (reg, insn))
1698           continue;
1699         else if (DEBUG_INSN_P (insn))
1700           ++*debug_uses;
1701         else if (++count_ref > 1)
1702           break;
1703     }
1704   free (body);
1705   return (count_ref  == 1);
1706 }
1707
1708 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1709
1710 static void
1711 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1712 {
1713   basic_block *body, bb;
1714   unsigned i;
1715   rtx_insn *insn;
1716
1717   body = get_loop_body (loop);
1718   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1719     {
1720       bb = body[i];
1721
1722       FOR_BB_INSNS (bb, insn)
1723         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1724           continue;
1725         else
1726           {
1727             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1728                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1729             if (!--debug_uses)
1730               break;
1731           }
1732     }
1733   free (body);
1734 }
1735
1736 /* Determine whether INSN contains an accumulator
1737    which can be expanded into separate copies,
1738    one for each copy of the LOOP body.
1739
1740    for (i = 0 ; i < n; i++)
1741      sum += a[i];
1742
1743    ==>
1744
1745    sum += a[i]
1746    ....
1747    i = i+1;
1748    sum1 += a[i]
1749    ....
1750    i = i+1
1751    sum2 += a[i];
1752    ....
1753
1754    Return NULL if INSN contains no opportunity for expansion of accumulator.
1755    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1756    information and return a pointer to it.
1757 */
1758
1759 static struct var_to_expand *
1760 analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
1761 {
1762   rtx set, dest, src;
1763   struct var_to_expand *ves;
1764   unsigned accum_pos;
1765   enum rtx_code code;
1766   int debug_uses = 0;
1767
1768   set = single_set (insn);
1769   if (!set)
1770     return NULL;
1771
1772   dest = SET_DEST (set);
1773   src = SET_SRC (set);
1774   code = GET_CODE (src);
1775
1776   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1777     return NULL;
1778
1779   if (FLOAT_MODE_P (GET_MODE (dest)))
1780     {
1781       if (!flag_associative_math)
1782         return NULL;
1783       /* In the case of FMA, we're also changing the rounding.  */
1784       if (code == FMA && !flag_unsafe_math_optimizations)
1785         return NULL;
1786     }
1787
1788   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1789      in MD.  But if there is no optab to generate the insn, we can not
1790      perform the variable expansion.  This can happen if an MD provides
1791      an insn but not a named pattern to generate it, for example to avoid
1792      producing code that needs additional mode switches like for x87/mmx.
1793
1794      So we check have_insn_for which looks for an optab for the operation
1795      in SRC.  If it doesn't exist, we can't perform the expansion even
1796      though INSN is valid.  */
1797   if (!have_insn_for (code, GET_MODE (src)))
1798     return NULL;
1799
1800   if (!REG_P (dest)
1801       && !(GET_CODE (dest) == SUBREG
1802            && REG_P (SUBREG_REG (dest))))
1803     return NULL;
1804
1805   /* Find the accumulator use within the operation.  */
1806   if (code == FMA)
1807     {
1808       /* We only support accumulation via FMA in the ADD position.  */
1809       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1810         return NULL;
1811       accum_pos = 2;
1812     }
1813   else if (rtx_equal_p (dest, XEXP (src, 0)))
1814     accum_pos = 0;
1815   else if (rtx_equal_p (dest, XEXP (src, 1)))
1816     {
1817       /* The method of expansion that we are using; which includes the
1818          initialization of the expansions with zero and the summation of
1819          the expansions at the end of the computation will yield wrong
1820          results for (x = something - x) thus avoid using it in that case.  */
1821       if (code == MINUS)
1822         return NULL;
1823       accum_pos = 1;
1824     }
1825   else
1826     return NULL;
1827
1828   /* It must not otherwise be used.  */
1829   if (code == FMA)
1830     {
1831       if (rtx_referenced_p (dest, XEXP (src, 0))
1832           || rtx_referenced_p (dest, XEXP (src, 1)))
1833         return NULL;
1834     }
1835   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1836     return NULL;
1837
1838   /* It must be used in exactly one insn.  */
1839   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1840     return NULL;
1841
1842   if (dump_file)
1843     {
1844       fprintf (dump_file, "\n;; Expanding Accumulator ");
1845       print_rtl (dump_file, dest);
1846       fprintf (dump_file, "\n");
1847     }
1848
1849   if (debug_uses)
1850     /* Instead of resetting the debug insns, we could replace each
1851        debug use in the loop with the sum or product of all expanded
1852        accummulators.  Since we'll only know of all expansions at the
1853        end, we'd have to keep track of which vars_to_expand a debug
1854        insn in the loop references, take note of each copy of the
1855        debug insn during unrolling, and when it's all done, compute
1856        the sum or product of each variable and adjust the original
1857        debug insn and each copy thereof.  What a pain!  */
1858     reset_debug_uses_in_loop (loop, dest, debug_uses);
1859
1860   /* Record the accumulator to expand.  */
1861   ves = XNEW (struct var_to_expand);
1862   ves->insn = insn;
1863   ves->reg = copy_rtx (dest);
1864   ves->var_expansions.create (1);
1865   ves->next = NULL;
1866   ves->op = GET_CODE (src);
1867   ves->expansion_count = 0;
1868   ves->reuse_expansion = 0;
1869   return ves;
1870 }
1871
1872 /* Determine whether there is an induction variable in INSN that
1873    we would like to split during unrolling.
1874
1875    I.e. replace
1876
1877    i = i + 1;
1878    ...
1879    i = i + 1;
1880    ...
1881    i = i + 1;
1882    ...
1883
1884    type chains by
1885
1886    i0 = i + 1
1887    ...
1888    i = i0 + 1
1889    ...
1890    i = i0 + 2
1891    ...
1892
1893    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1894    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1895    pointer to it.  */
1896
1897 static struct iv_to_split *
1898 analyze_iv_to_split_insn (rtx_insn *insn)
1899 {
1900   rtx set, dest;
1901   struct rtx_iv iv;
1902   struct iv_to_split *ivts;
1903   bool ok;
1904
1905   /* For now we just split the basic induction variables.  Later this may be
1906      extended for example by selecting also addresses of memory references.  */
1907   set = single_set (insn);
1908   if (!set)
1909     return NULL;
1910
1911   dest = SET_DEST (set);
1912   if (!REG_P (dest))
1913     return NULL;
1914
1915   if (!biv_p (insn, dest))
1916     return NULL;
1917
1918   ok = iv_analyze_result (insn, dest, &iv);
1919
1920   /* This used to be an assert under the assumption that if biv_p returns
1921      true that iv_analyze_result must also return true.  However, that
1922      assumption is not strictly correct as evidenced by pr25569.
1923
1924      Returning NULL when iv_analyze_result returns false is safe and
1925      avoids the problems in pr25569 until the iv_analyze_* routines
1926      can be fixed, which is apparently hard and time consuming
1927      according to their author.  */
1928   if (! ok)
1929     return NULL;
1930
1931   if (iv.step == const0_rtx
1932       || iv.mode != iv.extend_mode)
1933     return NULL;
1934
1935   /* Record the insn to split.  */
1936   ivts = XNEW (struct iv_to_split);
1937   ivts->insn = insn;
1938   ivts->orig_var = dest;
1939   ivts->base_var = NULL_RTX;
1940   ivts->step = iv.step;
1941   ivts->next = NULL;
1942
1943   return ivts;
1944 }
1945
1946 /* Determines which of insns in LOOP can be optimized.
1947    Return a OPT_INFO struct with the relevant hash tables filled
1948    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1949    is undefined for the return value.  */
1950
1951 static struct opt_info *
1952 analyze_insns_in_loop (struct loop *loop)
1953 {
1954   basic_block *body, bb;
1955   unsigned i;
1956   struct opt_info *opt_info = XCNEW (struct opt_info);
1957   rtx_insn *insn;
1958   struct iv_to_split *ivts = NULL;
1959   struct var_to_expand *ves = NULL;
1960   iv_to_split **slot1;
1961   var_to_expand **slot2;
1962   vec<edge> edges = get_loop_exit_edges (loop);
1963   edge exit;
1964   bool can_apply = false;
1965
1966   iv_analysis_loop_init (loop);
1967
1968   body = get_loop_body (loop);
1969
1970   if (flag_split_ivs_in_unroller)
1971     {
1972       opt_info->insns_to_split
1973         = new hash_table<iv_split_hasher> (5 * loop->num_nodes);
1974       opt_info->iv_to_split_head = NULL;
1975       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1976     }
1977
1978   /* Record the loop exit bb and loop preheader before the unrolling.  */
1979   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1980
1981   if (edges.length () == 1)
1982     {
1983       exit = edges[0];
1984       if (!(exit->flags & EDGE_COMPLEX))
1985         {
1986           opt_info->loop_exit = split_edge (exit);
1987           can_apply = true;
1988         }
1989     }
1990
1991   if (flag_variable_expansion_in_unroller
1992       && can_apply)
1993     {
1994       opt_info->insns_with_var_to_expand
1995         = new hash_table<var_expand_hasher> (5 * loop->num_nodes);
1996       opt_info->var_to_expand_head = NULL;
1997       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1998     }
1999
2000   for (i = 0; i < loop->num_nodes; i++)
2001     {
2002       bb = body[i];
2003       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
2004         continue;
2005
2006       FOR_BB_INSNS (bb, insn)
2007       {
2008         if (!INSN_P (insn))
2009           continue;
2010
2011         if (opt_info->insns_to_split)
2012           ivts = analyze_iv_to_split_insn (insn);
2013
2014         if (ivts)
2015           {
2016             slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
2017             gcc_assert (*slot1 == NULL);
2018             *slot1 = ivts;
2019             *opt_info->iv_to_split_tail = ivts;
2020             opt_info->iv_to_split_tail = &ivts->next;
2021             continue;
2022           }
2023
2024         if (opt_info->insns_with_var_to_expand)
2025           ves = analyze_insn_to_expand_var (loop, insn);
2026
2027         if (ves)
2028           {
2029             slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
2030             gcc_assert (*slot2 == NULL);
2031             *slot2 = ves;
2032             *opt_info->var_to_expand_tail = ves;
2033             opt_info->var_to_expand_tail = &ves->next;
2034           }
2035       }
2036     }
2037
2038   edges.release ();
2039   free (body);
2040   return opt_info;
2041 }
2042
2043 /* Called just before loop duplication.  Records start of duplicated area
2044    to OPT_INFO.  */
2045
2046 static void
2047 opt_info_start_duplication (struct opt_info *opt_info)
2048 {
2049   if (opt_info)
2050     opt_info->first_new_block = last_basic_block_for_fn (cfun);
2051 }
2052
/* Return the number of iterations that separate the initialization of
   the base variable from the copy numbered N_COPY.  N_COPIES is the
   total number of newly created copies.  UNROLLING is true when the loop
   is being unrolled rather than peeled.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization sits in the original loop body,
     which has number 0, so the distance is the copy number itself.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization sits in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy ? n_copy - 1 : n_copies;
}
2077
2078 /* Allocate basic variable for the induction variable chain.  */
2079
2080 static void
2081 allocate_basic_variable (struct iv_to_split *ivts)
2082 {
2083   rtx expr = SET_SRC (single_set (ivts->insn));
2084
2085   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2086 }
2087
2088 /* Insert initialization of basic variable of IVTS before INSN, taking
2089    the initial value from INSN.  */
2090
2091 static void
2092 insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
2093 {
2094   rtx expr = copy_rtx (SET_SRC (single_set (insn)));
2095   rtx_insn *seq;
2096
2097   start_sequence ();
2098   expr = force_operand (expr, ivts->base_var);
2099   if (expr != ivts->base_var)
2100     emit_move_insn (ivts->base_var, expr);
2101   seq = get_insns ();
2102   end_sequence ();
2103
2104   emit_insn_before (seq, insn);
2105 }
2106
2107 /* Replace the use of induction variable described in IVTS in INSN
2108    by base variable + DELTA * step.  */
2109
2110 static void
2111 split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
2112 {
2113   rtx expr, *loc, incr, var;
2114   rtx_insn *seq;
2115   enum machine_mode mode = GET_MODE (ivts->base_var);
2116   rtx src, dest, set;
2117
2118   /* Construct base + DELTA * step.  */
2119   if (!delta)
2120     expr = ivts->base_var;
2121   else
2122     {
2123       incr = simplify_gen_binary (MULT, mode,
2124                                   ivts->step, gen_int_mode (delta, mode));
2125       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2126                                   ivts->base_var, incr);
2127     }
2128
2129   /* Figure out where to do the replacement.  */
2130   loc = &SET_SRC (single_set (insn));
2131
2132   /* If we can make the replacement right away, we're done.  */
2133   if (validate_change (insn, loc, expr, 0))
2134     return;
2135
2136   /* Otherwise, force EXPR into a register and try again.  */
2137   start_sequence ();
2138   var = gen_reg_rtx (mode);
2139   expr = force_operand (expr, var);
2140   if (expr != var)
2141     emit_move_insn (var, expr);
2142   seq = get_insns ();
2143   end_sequence ();
2144   emit_insn_before (seq, insn);
2145
2146   if (validate_change (insn, loc, var, 0))
2147     return;
2148
2149   /* The last chance.  Try recreating the assignment in insn
2150      completely from scratch.  */
2151   set = single_set (insn);
2152   gcc_assert (set);
2153
2154   start_sequence ();
2155   *loc = var;
2156   src = copy_rtx (SET_SRC (set));
2157   dest = copy_rtx (SET_DEST (set));
2158   src = force_operand (src, dest);
2159   if (src != dest)
2160     emit_move_insn (dest, src);
2161   seq = get_insns ();
2162   end_sequence ();
2163
2164   emit_insn_before (seq, insn);
2165   delete_insn (insn);
2166 }
2167
2168
2169 /* Return one expansion of the accumulator recorded in struct VE.  */
2170
2171 static rtx
2172 get_expansion (struct var_to_expand *ve)
2173 {
2174   rtx reg;
2175
2176   if (ve->reuse_expansion == 0)
2177     reg = ve->reg;
2178   else
2179     reg = ve->var_expansions[ve->reuse_expansion - 1];
2180
2181   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2182     ve->reuse_expansion = 0;
2183   else
2184     ve->reuse_expansion++;
2185
2186   return reg;
2187 }
2188
2189
2190 /* Given INSN replace the uses of the accumulator recorded in VE
2191    with a new register.  */
2192
2193 static void
2194 expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
2195 {
2196   rtx new_reg, set;
2197   bool really_new_expansion = false;
2198
2199   set = single_set (insn);
2200   gcc_assert (set);
2201
2202   /* Generate a new register only if the expansion limit has not been
2203      reached.  Else reuse an already existing expansion.  */
2204   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2205     {
2206       really_new_expansion = true;
2207       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2208     }
2209   else
2210     new_reg = get_expansion (ve);
2211
2212   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2213   if (apply_change_group ())
2214     if (really_new_expansion)
2215       {
2216         ve->var_expansions.safe_push (new_reg);
2217         ve->expansion_count++;
2218       }
2219 }
2220
2221 /* Initialize the variable expansions in loop preheader.  PLACE is the
2222    loop-preheader basic block where the initialization of the
2223    expansions should take place.  The expansions are initialized with
2224    (-0) when the operation is plus or minus to honor sign zero.  This
2225    way we can prevent cases where the sign of the final result is
2226    effected by the sign of the expansion.  Here is an example to
2227    demonstrate this:
2228
2229    for (i = 0 ; i < n; i++)
2230      sum += something;
2231
2232    ==>
2233
2234    sum += something
2235    ....
2236    i = i+1;
2237    sum1 += something
2238    ....
2239    i = i+1
2240    sum2 += something;
2241    ....
2242
2243    When SUM is initialized with -zero and SOMETHING is also -zero; the
2244    final result of sum should be -zero thus the expansions sum1 and sum2
2245    should be initialized with -zero as well (otherwise we will get +zero
2246    as the final result).  */
2247
2248 static void
2249 insert_var_expansion_initialization (struct var_to_expand *ve,
2250                                      basic_block place)
2251 {
2252   rtx_insn *seq;
2253   rtx var, zero_init;
2254   unsigned i;
2255   enum machine_mode mode = GET_MODE (ve->reg);
2256   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2257
2258   if (ve->var_expansions.length () == 0)
2259     return;
2260
2261   start_sequence ();
2262   switch (ve->op)
2263     {
2264     case FMA:
2265       /* Note that we only accumulate FMA via the ADD operand.  */
2266     case PLUS:
2267     case MINUS:
2268       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2269         {
2270           if (honor_signed_zero_p)
2271             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2272           else
2273             zero_init = CONST0_RTX (mode);
2274           emit_move_insn (var, zero_init);
2275         }
2276       break;
2277
2278     case MULT:
2279       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2280         {
2281           zero_init = CONST1_RTX (GET_MODE (var));
2282           emit_move_insn (var, zero_init);
2283         }
2284       break;
2285
2286     default:
2287       gcc_unreachable ();
2288     }
2289
2290   seq = get_insns ();
2291   end_sequence ();
2292
2293   emit_insn_after (seq, BB_END (place));
2294 }
2295
2296 /* Combine the variable expansions at the loop exit.  PLACE is the
2297    loop exit basic block where the summation of the expansions should
2298    take place.  */
2299
2300 static void
2301 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2302 {
2303   rtx sum = ve->reg;
2304   rtx expr, var;
2305   rtx_insn *seq, *insn;
2306   unsigned i;
2307
2308   if (ve->var_expansions.length () == 0)
2309     return;
2310
2311   start_sequence ();
2312   switch (ve->op)
2313     {
2314     case FMA:
2315       /* Note that we only accumulate FMA via the ADD operand.  */
2316     case PLUS:
2317     case MINUS:
2318       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2319         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2320       break;
2321
2322     case MULT:
2323       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2324         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2325       break;
2326
2327     default:
2328       gcc_unreachable ();
2329     }
2330
2331   expr = force_operand (sum, ve->reg);
2332   if (expr != ve->reg)
2333     emit_move_insn (ve->reg, expr);
2334   seq = get_insns ();
2335   end_sequence ();
2336
2337   insn = BB_HEAD (place);
2338   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2339     insn = NEXT_INSN (insn);
2340
2341   emit_insn_after (seq, insn);
2342 }
2343
2344 /* Strip away REG_EQUAL notes for IVs we're splitting.
2345
2346    Updating REG_EQUAL notes for IVs we split is tricky: We
2347    cannot tell until after unrolling, DF-rescanning, and liveness
2348    updating, whether an EQ_USE is reached by the split IV while
2349    the IV reg is still live.  See PR55006.
2350
2351    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2352    because RTL loop-iv requires us to defer rescanning insns and
2353    any notes attached to them.  So resort to old techniques...  */
2354
2355 static void
2356 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
2357 {
2358   struct iv_to_split *ivts;
2359   rtx note = find_reg_equal_equiv_note (insn);
2360   if (! note)
2361     return;
2362   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2363     if (reg_mentioned_p (ivts->orig_var, note))
2364       {
2365         remove_note (insn, note);
2366         return;
2367       }
2368 }
2369
2370 /* Apply loop optimizations in loop copies using the
2371    data which gathered during the unrolling.  Structure
2372    OPT_INFO record that data.
2373
2374    UNROLLING is true if we unrolled (not peeled) the loop.
2375    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2376    the loop (as it should happen in complete unrolling, but not in ordinary
2377    peeling of the loop).  */
2378
2379 static void
2380 apply_opt_in_copies (struct opt_info *opt_info,
2381                      unsigned n_copies, bool unrolling,
2382                      bool rewrite_original_loop)
2383 {
2384   unsigned i, delta;
2385   basic_block bb, orig_bb;
2386   rtx_insn *insn, *orig_insn, *next;
2387   struct iv_to_split ivts_templ, *ivts;
2388   struct var_to_expand ve_templ, *ves;
2389
2390   /* Sanity check -- we need to put initialization in the original loop
2391      body.  */
2392   gcc_assert (!unrolling || rewrite_original_loop);
2393
2394   /* Allocate the basic variables (i0).  */
2395   if (opt_info->insns_to_split)
2396     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2397       allocate_basic_variable (ivts);
2398
2399   for (i = opt_info->first_new_block;
2400        i < (unsigned) last_basic_block_for_fn (cfun);
2401        i++)
2402     {
2403       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2404       orig_bb = get_bb_original (bb);
2405
2406       /* bb->aux holds position in copy sequence initialized by
2407          duplicate_loop_to_header_edge.  */
2408       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2409                                         unrolling);
2410       bb->aux = 0;
2411       orig_insn = BB_HEAD (orig_bb);
2412       FOR_BB_INSNS_SAFE (bb, insn, next)
2413         {
2414           if (!INSN_P (insn)
2415               || (DEBUG_INSN_P (insn)
2416                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2417             continue;
2418
2419           while (!INSN_P (orig_insn)
2420                  || (DEBUG_INSN_P (orig_insn)
2421                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2422                          == LABEL_DECL)))
2423             orig_insn = NEXT_INSN (orig_insn);
2424
2425           ivts_templ.insn = orig_insn;
2426           ve_templ.insn = orig_insn;
2427
2428           /* Apply splitting iv optimization.  */
2429           if (opt_info->insns_to_split)
2430             {
2431               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2432
2433               ivts = opt_info->insns_to_split->find (&ivts_templ);
2434
2435               if (ivts)
2436                 {
2437                   gcc_assert (GET_CODE (PATTERN (insn))
2438                               == GET_CODE (PATTERN (orig_insn)));
2439
2440                   if (!delta)
2441                     insert_base_initialization (ivts, insn);
2442                   split_iv (ivts, insn, delta);
2443                 }
2444             }
2445           /* Apply variable expansion optimization.  */
2446           if (unrolling && opt_info->insns_with_var_to_expand)
2447             {
2448               ves = (struct var_to_expand *)
2449                 opt_info->insns_with_var_to_expand->find (&ve_templ);
2450               if (ves)
2451                 {
2452                   gcc_assert (GET_CODE (PATTERN (insn))
2453                               == GET_CODE (PATTERN (orig_insn)));
2454                   expand_var_during_unrolling (ves, insn);
2455                 }
2456             }
2457           orig_insn = NEXT_INSN (orig_insn);
2458         }
2459     }
2460
2461   if (!rewrite_original_loop)
2462     return;
2463
2464   /* Initialize the variable expansions in the loop preheader
2465      and take care of combining them at the loop exit.  */
2466   if (opt_info->insns_with_var_to_expand)
2467     {
2468       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2469         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2470       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2471         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2472     }
2473
2474   /* Rewrite also the original loop body.  Find them as originals of the blocks
2475      in the last copied iteration, i.e. those that have
2476      get_bb_copy (get_bb_original (bb)) == bb.  */
2477   for (i = opt_info->first_new_block;
2478        i < (unsigned) last_basic_block_for_fn (cfun);
2479        i++)
2480     {
2481       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2482       orig_bb = get_bb_original (bb);
2483       if (get_bb_copy (orig_bb) != bb)
2484         continue;
2485
2486       delta = determine_split_iv_delta (0, n_copies, unrolling);
2487       for (orig_insn = BB_HEAD (orig_bb);
2488            orig_insn != NEXT_INSN (BB_END (bb));
2489            orig_insn = next)
2490         {
2491           next = NEXT_INSN (orig_insn);
2492
2493           if (!INSN_P (orig_insn))
2494             continue;
2495
2496           ivts_templ.insn = orig_insn;
2497           if (opt_info->insns_to_split)
2498             {
2499               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2500
2501               ivts = (struct iv_to_split *)
2502                 opt_info->insns_to_split->find (&ivts_templ);
2503               if (ivts)
2504                 {
2505                   if (!delta)
2506                     insert_base_initialization (ivts, orig_insn);
2507                   split_iv (ivts, orig_insn, delta);
2508                   continue;
2509                 }
2510             }
2511
2512         }
2513     }
2514 }
2515
2516 /* Release OPT_INFO.  */
2517
2518 static void
2519 free_opt_info (struct opt_info *opt_info)
2520 {
2521   delete opt_info->insns_to_split;
2522   opt_info->insns_to_split = NULL;
2523   if (opt_info->insns_with_var_to_expand)
2524     {
2525       struct var_to_expand *ves;
2526
2527       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2528         ves->var_expansions.release ();
2529       delete opt_info->insns_with_var_to_expand;
2530       opt_info->insns_with_var_to_expand = NULL;
2531     }
2532   free (opt_info);
2533 }