gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "hard-reg-set.h"
  26 #include "obstack.h"
  27 #include "basic-block.h"
  28 #include "cfgloop.h"
  29 #include "params.h"
  30 #include "expr.h"
  31 #include "hashtab.h"
  32 #include "recog.h"
  33 #include "target.h"
  34 #include "dumpfile.h"
  35
  36 /* This pass performs loop unrolling and peeling.  We only perform these
  37    optimizations on innermost loops (with single exception) because
  38    the impact on performance is greatest here, and we want to avoid
  39    unnecessary code size growth.  The gain is caused by greater sequentiality
  40    of code, better code to optimize for further passes and in some cases
   41    by fewer tests of exit conditions.  The main problem is code growth,
   42    which hurts performance because of cache effects.
  43
  44    What we do:
  45
  46    -- complete peeling of once-rolling loops; this is the above mentioned
  47       exception, as this causes loop to be cancelled completely and
  48       does not cause code growth
  49    -- complete peeling of loops that roll (small) constant times.
  50    -- simple peeling of first iterations of loops that do not roll much
  51       (according to profile feedback)
  52    -- unrolling of loops that roll constant times; this is almost always
  53       win, as we get rid of exit condition tests.
  54    -- unrolling of loops that roll number of times that we can compute
  55       in runtime; we also get rid of exit condition tests here, but there
  56       is the extra expense for calculating the number of iterations
  57    -- simple unrolling of remaining loops; this is performed only if we
  58       are asked to, as the gain is questionable in this case and often
  59       it may even slow down the code
  60    For more detailed descriptions of each of those, see comments at
  61    appropriate function below.
  62
   63    There are many parameters (defined and described in params.def) that
   64    control how much we unroll/peel.
  65
   66    ??? A major problem is that we have no good way to determine how
   67    many times we should unroll the loop; the experiments I have made
   68    showed that this choice may affect performance by several percent.
  69    */
  70
  71 /* Information about induction variables to split.  */
  72
  73 struct iv_to_split
  74 {
  75   rtx insn;             /* The insn in that the induction variable occurs.  */
  76   rtx orig_var;         /* The variable (register) for the IV before split.  */
  77   rtx base_var;         /* The variable on that the values in the further
  78                            iterations are based.  */
  79   rtx step;             /* Step of the induction variable.  */
  80   struct iv_to_split *next; /* Next entry in walking order.  */
  81   unsigned n_loc;
  82   unsigned loc[3];      /* Location where the definition of the induction
  83                            variable occurs in the insn.  For example if
  84                            N_LOC is 2, the expression is located at
  85                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  86 };
  87
  88 /* Information about accumulators to expand.  */
  89
  90 struct var_to_expand
  91 {
  92   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  93   rtx reg;                         /* The accumulator which is expanded.  */
  94   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  95   struct var_to_expand *next;      /* Next entry in walking order.  */
  96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  97                                       or multiplication.  */
  98   int expansion_count;             /* Count the number of expansions generated so far.  */
  99   int reuse_expansion;             /* The expansion we intend to reuse to expand
 100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 101                                       the original accumulator.  Else use
 102                                       var_expansions[REUSE_EXPANSION - 1].  */
 103 };
 104
 105 /* Information about optimization applied in
 106    the unrolled loop.  */
 107
 108 struct opt_info
 109 {
 110   htab_t insns_to_split;           /* A hashtable of insns to split.  */
 111   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 112   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 113   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
 114                                       to expand.  */
 115   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 116   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 117   unsigned first_new_block;        /* The first basic block that was
 118                                       duplicated.  */
 119   basic_block loop_exit;           /* The loop exit basic block.  */
 120   basic_block loop_preheader;      /* The loop preheader basic block.  */
 121 };
 122
 123 static void decide_unrolling_and_peeling (int);
 124 static void peel_loops_completely (int);
 125 static void decide_peel_simple (struct loop *, int);
 126 static void decide_peel_once_rolling (struct loop *, int);
 127 static void decide_peel_completely (struct loop *, int);
 128 static void decide_unroll_stupid (struct loop *, int);
 129 static void decide_unroll_constant_iterations (struct loop *, int);
 130 static void decide_unroll_runtime_iterations (struct loop *, int);
 131 static void peel_loop_simple (struct loop *);
 132 static void peel_loop_completely (struct loop *);
 133 static void unroll_loop_stupid (struct loop *);
 134 static void unroll_loop_constant_iterations (struct loop *);
 135 static void unroll_loop_runtime_iterations (struct loop *);
 136 static struct opt_info *analyze_insns_in_loop (struct loop *);
 137 static void opt_info_start_duplication (struct opt_info *);
 138 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 139 static void free_opt_info (struct opt_info *);
 140 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 141 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 142 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 143 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 144 static void insert_var_expansion_initialization (struct var_to_expand *,
 145                                                  basic_block);
 146 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 147                                              basic_block);
 148 static rtx get_expansion (struct var_to_expand *);
 149
 150 /* Emit a message summarizing the unroll or peel that will be
 151    performed for LOOP, along with the loop's location LOCUS, if
 152    appropriate given the dump or -fopt-info settings.  */
 153
 154 static void
 155 report_unroll_peel (struct loop *loop, location_t locus)
 156 {
 157   struct niter_desc *desc;
 158   int niters = 0;
 159   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 160
 161   if (!dump_enabled_p ())
 162     return;
 163
 164   /* In the special case where the loop never iterated, emit
 165      a different message so that we don't report an unroll by 0.
 166      This matches the equivalent message emitted during tree unrolling.  */
 167   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 168       && !loop->lpt_decision.times)
 169     {
 170       dump_printf_loc (report_flags, locus,
 171                        "Turned loop into non-loop; it never loops.\n");
 172       return;
 173     }
 174
 175   desc = get_simple_loop_desc (loop);
 176
 177   if (desc->const_iter)
 178     niters = desc->niter;
 179   else if (loop->header->count)
 180     niters = expected_loop_iterations (loop);
 181
 182   dump_printf_loc (report_flags, locus,
 183                    "%s loop %d times",
 184                    (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 185                     ?  "Completely unroll"
 186                     : (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 187                        ? "Peel" : "Unroll")),
 188                    loop->lpt_decision.times);
 189   if (profile_info)
 190     dump_printf (report_flags,
 191                  " (header execution count %d",
 192                  (int)loop->header->count);
 193   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 194     dump_printf (report_flags,
 195                  "%s%s iterations %d)",
 196                  profile_info ? ", " : " (",
 197                  desc->const_iter ? "const" : "average",
 198                  niters);
 199   else if (profile_info)
 200     dump_printf (report_flags, ")");
 201
 202   dump_printf (report_flags, "\n");
 203 }
 204
 205 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 206 void
 207 unroll_and_peel_loops (int flags)
 208 {
 209   struct loop *loop;
 210   bool check;
 211   loop_iterator li;
 212
 213   /* First perform complete loop peeling (it is almost surely a win,
 214      and affects parameters for further decision a lot).  */
 215   peel_loops_completely (flags);
 216
 217   /* Now decide rest of unrolling and peeling.  */
 218   decide_unrolling_and_peeling (flags);
 219
 220   /* Scan the loops, inner ones first.  */
 221   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 222     {
 223       check = true;
 224       /* And perform the appropriate transformations.  */
 225       switch (loop->lpt_decision.decision)
 226         {
 227         case LPT_PEEL_COMPLETELY:
 228           /* Already done.  */
 229           gcc_unreachable ();
 230         case LPT_PEEL_SIMPLE:
 231           peel_loop_simple (loop);
 232           break;
 233         case LPT_UNROLL_CONSTANT:
 234           unroll_loop_constant_iterations (loop);
 235           break;
 236         case LPT_UNROLL_RUNTIME:
 237           unroll_loop_runtime_iterations (loop);
 238           break;
 239         case LPT_UNROLL_STUPID:
 240           unroll_loop_stupid (loop);
 241           break;
 242         case LPT_NONE:
 243           check = false;
 244           break;
 245         default:
 246           gcc_unreachable ();
 247         }
 248       if (check)
 249         {
 250 #ifdef ENABLE_CHECKING
 251           verify_loop_structure ();
 252 #endif
 253         }
 254     }
 255
 256   iv_analysis_done ();
 257 }
 258
 259 /* Check whether exit of the LOOP is at the end of loop body.  */
 260
 261 static bool
 262 loop_exit_at_end_p (struct loop *loop)
 263 {
 264   struct niter_desc *desc = get_simple_loop_desc (loop);
 265   rtx insn;
 266
 267   if (desc->in_edge->dest != loop->latch)
 268     return false;
 269
 270   /* Check that the latch is empty.  */
 271   FOR_BB_INSNS (loop->latch, insn)
 272     {
 273       if (NONDEBUG_INSN_P (insn))
 274         return false;
 275     }
 276
 277   return true;
 278 }
 279
 280 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 281 static void
 282 peel_loops_completely (int flags)
 283 {
 284   struct loop *loop;
 285   loop_iterator li;
 286
 287   /* Scan the loops, the inner ones first.  */
 288   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 289     {
 290       loop->lpt_decision.decision = LPT_NONE;
 291       location_t locus = get_loop_location (loop);
 292
 293       if (dump_enabled_p ())
 294         dump_printf_loc (TDF_RTL, locus,
 295                          ";; *** Considering loop %d at BB %d for "
 296                          "complete peeling ***\n",
 297                          loop->num, loop->header->index);
 298
 299       loop->ninsns = num_loop_insns (loop);
 300
 301       decide_peel_once_rolling (loop, flags);
 302       if (loop->lpt_decision.decision == LPT_NONE)
 303         decide_peel_completely (loop, flags);
 304
 305       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 306         {
 307           report_unroll_peel (loop, locus);
 308           peel_loop_completely (loop);
 309 #ifdef ENABLE_CHECKING
 310           verify_loop_structure ();
 311 #endif
 312         }
 313     }
 314 }
 315
 316 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 317 static void
 318 decide_unrolling_and_peeling (int flags)
 319 {
 320   struct loop *loop;
 321   loop_iterator li;
 322
 323   /* Scan the loops, inner ones first.  */
 324   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 325     {
 326       loop->lpt_decision.decision = LPT_NONE;
 327       location_t locus = get_loop_location (loop);
 328
 329       if (dump_enabled_p ())
 330         dump_printf_loc (TDF_RTL, locus,
 331                          ";; *** Considering loop %d at BB %d for "
 332                          "unrolling and peeling ***\n",
 333                          loop->num, loop->header->index);
 334
 335       /* Do not peel cold areas.  */
 336       if (optimize_loop_for_size_p (loop))
 337         {
 338           if (dump_file)
 339             fprintf (dump_file, ";; Not considering loop, cold area\n");
 340           continue;
 341         }
 342
 343       /* Can the loop be manipulated?  */
 344       if (!can_duplicate_loop_p (loop))
 345         {
 346           if (dump_file)
 347             fprintf (dump_file,
 348                      ";; Not considering loop, cannot duplicate\n");
 349           continue;
 350         }
 351
 352       /* Skip non-innermost loops.  */
 353       if (loop->inner)
 354         {
 355           if (dump_file)
 356             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 357           continue;
 358         }
 359
 360       loop->ninsns = num_loop_insns (loop);
 361       loop->av_ninsns = average_num_loop_insns (loop);
 362
 363       /* Try transformations one by one in decreasing order of
 364          priority.  */
 365
 366       decide_unroll_constant_iterations (loop, flags);
 367       if (loop->lpt_decision.decision == LPT_NONE)
 368         decide_unroll_runtime_iterations (loop, flags);
 369       if (loop->lpt_decision.decision == LPT_NONE)
 370         decide_unroll_stupid (loop, flags);
 371       if (loop->lpt_decision.decision == LPT_NONE)
 372         decide_peel_simple (loop, flags);
 373
 374       report_unroll_peel (loop, locus);
 375     }
 376 }
 377
 378 /* Decide whether the LOOP is once rolling and suitable for complete
 379    peeling.  */
 380 static void
 381 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 382 {
 383   struct niter_desc *desc;
 384
 385   if (dump_file)
 386     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 387
 388   /* Is the loop small enough?  */
 389   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 390     {
 391       if (dump_file)
 392         fprintf (dump_file, ";; Not considering loop, is too big\n");
 393       return;
 394     }
 395
 396   /* Check for simple loops.  */
 397   desc = get_simple_loop_desc (loop);
 398
 399   /* Check number of iterations.  */
 400   if (!desc->simple_p
 401       || desc->assumptions
 402       || desc->infinite
 403       || !desc->const_iter
 404       || (desc->niter != 0
 405           && max_loop_iterations_int (loop) != 0))
 406     {
 407       if (dump_file)
 408         fprintf (dump_file,
 409                  ";; Unable to prove that the loop rolls exactly once\n");
 410       return;
 411     }
 412
 413   /* Success.  */
 414   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 415 }
 416
 417 /* Decide whether the LOOP is suitable for complete peeling.  */
 418 static void
 419 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 420 {
 421   unsigned npeel;
 422   struct niter_desc *desc;
 423
 424   if (dump_file)
 425     fprintf (dump_file, "\n;; Considering peeling completely\n");
 426
 427   /* Skip non-innermost loops.  */
 428   if (loop->inner)
 429     {
 430       if (dump_file)
 431         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 432       return;
 433     }
 434
 435   /* Do not peel cold areas.  */
 436   if (optimize_loop_for_size_p (loop))
 437     {
 438       if (dump_file)
 439         fprintf (dump_file, ";; Not considering loop, cold area\n");
 440       return;
 441     }
 442
 443   /* Can the loop be manipulated?  */
 444   if (!can_duplicate_loop_p (loop))
 445     {
 446       if (dump_file)
 447         fprintf (dump_file,
 448                  ";; Not considering loop, cannot duplicate\n");
 449       return;
 450     }
 451
 452   /* npeel = number of iterations to peel.  */
 453   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 454   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 455     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 456
 457   /* Is the loop small enough?  */
 458   if (!npeel)
 459     {
 460       if (dump_file)
 461         fprintf (dump_file, ";; Not considering loop, is too big\n");
 462       return;
 463     }
 464
 465   /* Check for simple loops.  */
 466   desc = get_simple_loop_desc (loop);
 467
 468   /* Check number of iterations.  */
 469   if (!desc->simple_p
 470       || desc->assumptions
 471       || !desc->const_iter
 472       || desc->infinite)
 473     {
 474       if (dump_file)
 475         fprintf (dump_file,
 476                  ";; Unable to prove that the loop iterates constant times\n");
 477       return;
 478     }
 479
 480   if (desc->niter > npeel - 1)
 481     {
 482       if (dump_file)
 483         {
 484           fprintf (dump_file,
 485                    ";; Not peeling loop completely, rolls too much (");
 486           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 487           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 488         }
 489       return;
 490     }
 491
 492   /* Success.  */
 493   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 494 }
 495
 496 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 497    completely.  The transformation done:
 498
 499    for (i = 0; i < 4; i++)
 500      body;
 501
 502    ==>
 503
 504    i = 0;
 505    body; i++;
 506    body; i++;
 507    body; i++;
 508    body; i++;
 509    */
 510 static void
 511 peel_loop_completely (struct loop *loop)
 512 {
 513   sbitmap wont_exit;
 514   unsigned HOST_WIDE_INT npeel;
 515   unsigned i;
 516   vec<edge> remove_edges;
 517   edge ein;
 518   struct niter_desc *desc = get_simple_loop_desc (loop);
 519   struct opt_info *opt_info = NULL;
 520
 521   npeel = desc->niter;
 522
 523   if (npeel)
 524     {
 525       bool ok;
 526
 527       wont_exit = sbitmap_alloc (npeel + 1);
 528       bitmap_ones (wont_exit);
 529       bitmap_clear_bit (wont_exit, 0);
 530       if (desc->noloop_assumptions)
 531         bitmap_clear_bit (wont_exit, 1);
 532
 533       remove_edges.create (0);
 534
 535       if (flag_split_ivs_in_unroller)
 536         opt_info = analyze_insns_in_loop (loop);
 537
 538       opt_info_start_duplication (opt_info);
 539       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 540                                           npeel,
 541                                           wont_exit, desc->out_edge,
 542                                           &remove_edges,
 543                                           DLTHE_FLAG_UPDATE_FREQ
 544                                           | DLTHE_FLAG_COMPLETTE_PEEL
 545                                           | (opt_info
 546                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 547       gcc_assert (ok);
 548
 549       free (wont_exit);
 550
 551       if (opt_info)
 552         {
 553           apply_opt_in_copies (opt_info, npeel, false, true);
 554           free_opt_info (opt_info);
 555         }
 556
 557       /* Remove the exit edges.  */
 558       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 559         remove_path (ein);
 560       remove_edges.release ();
 561     }
 562
 563   ein = desc->in_edge;
 564   free_simple_loop_desc (loop);
 565
 566   /* Now remove the unreachable part of the last iteration and cancel
 567      the loop.  */
 568   remove_path (ein);
 569
 570   if (dump_file)
 571     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 572 }
 573
 574 /* Decide whether to unroll LOOP iterating constant number of times
 575    and how much.  */
 576
 577 static void
 578 decide_unroll_constant_iterations (struct loop *loop, int flags)
 579 {
 580   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 581   struct niter_desc *desc;
 582   double_int iterations;
 583
 584   if (!(flags & UAP_UNROLL))
 585     {
 586       /* We were not asked to, just return back silently.  */
 587       return;
 588     }
 589
 590   if (dump_file)
 591     fprintf (dump_file,
 592              "\n;; Considering unrolling loop with constant "
 593              "number of iterations\n");
 594
 595   /* nunroll = total number of copies of the original loop body in
 596      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 597   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 598   nunroll_by_av
 599     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 600   if (nunroll > nunroll_by_av)
 601     nunroll = nunroll_by_av;
 602   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 603     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 604
 605   /* Skip big loops.  */
 606   if (nunroll <= 1)
 607     {
 608       if (dump_file)
 609         fprintf (dump_file, ";; Not considering loop, is too big\n");
 610       return;
 611     }
 612
 613   /* Check for simple loops.  */
 614   desc = get_simple_loop_desc (loop);
 615
 616   /* Check number of iterations.  */
 617   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 618     {
 619       if (dump_file)
 620         fprintf (dump_file,
 621                  ";; Unable to prove that the loop iterates constant times\n");
 622       return;
 623     }
 624
 625   /* Check whether the loop rolls enough to consider.
 626      Consult also loop bounds and profile; in the case the loop has more
 627      than one exit it may well loop less than determined maximal number
 628      of iterations.  */
 629   if (desc->niter < 2 * nunroll
 630       || ((estimated_loop_iterations (loop, &iterations)
 631            || max_loop_iterations (loop, &iterations))
 632           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 633     {
 634       if (dump_file)
 635         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 636       return;
 637     }
 638
 639   /* Success; now compute number of iterations to unroll.  We alter
 640      nunroll so that as few as possible copies of loop body are
 641      necessary, while still not decreasing the number of unrollings
 642      too much (at most by 1).  */
 643   best_copies = 2 * nunroll + 10;
 644
 645   i = 2 * nunroll + 2;
 646   if (i - 1 >= desc->niter)
 647     i = desc->niter - 2;
 648
 649   for (; i >= nunroll - 1; i--)
 650     {
 651       unsigned exit_mod = desc->niter % (i + 1);
 652
 653       if (!loop_exit_at_end_p (loop))
 654         n_copies = exit_mod + i + 1;
 655       else if (exit_mod != (unsigned) i
 656                || desc->noloop_assumptions != NULL_RTX)
 657         n_copies = exit_mod + i + 2;
 658       else
 659         n_copies = i + 1;
 660
 661       if (n_copies < best_copies)
 662         {
 663           best_copies = n_copies;
 664           best_unroll = i;
 665         }
 666     }
 667
 668   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 669   loop->lpt_decision.times = best_unroll;
 670 }
 671
 672 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 673    The transformation does this:
 674
 675    for (i = 0; i < 102; i++)
 676      body;
 677
 678    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 679
 680    i = 0;
 681    body; i++;
 682    body; i++;
 683    while (i < 102)
 684      {
 685        body; i++;
 686        body; i++;
 687        body; i++;
 688        body; i++;
 689      }
 690   */
 691 static void
 692 unroll_loop_constant_iterations (struct loop *loop)
 693 {
 694   unsigned HOST_WIDE_INT niter;
 695   unsigned exit_mod;
 696   sbitmap wont_exit;
 697   unsigned i;
 698   vec<edge> remove_edges;
 699   edge e;
 700   unsigned max_unroll = loop->lpt_decision.times;
 701   struct niter_desc *desc = get_simple_loop_desc (loop);
 702   bool exit_at_end = loop_exit_at_end_p (loop);
 703   struct opt_info *opt_info = NULL;
 704   bool ok;
 705
 706   niter = desc->niter;
 707
 708   /* Should not get here (such loop should be peeled instead).  */
 709   gcc_assert (niter > max_unroll + 1);
 710
 711   exit_mod = niter % (max_unroll + 1);
 712
 713   wont_exit = sbitmap_alloc (max_unroll + 1);
 714   bitmap_ones (wont_exit);
 715
 716   remove_edges.create (0);
 717   if (flag_split_ivs_in_unroller
 718       || flag_variable_expansion_in_unroller)
 719     opt_info = analyze_insns_in_loop (loop);
 720
 721   if (!exit_at_end)
 722     {
 723       /* The exit is not at the end of the loop; leave exit test
 724          in the first copy, so that the loops that start with test
 725          of exit condition have continuous body after unrolling.  */
 726
 727       if (dump_file)
 728         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 729
 730       /* Peel exit_mod iterations.  */
 731       bitmap_clear_bit (wont_exit, 0);
 732       if (desc->noloop_assumptions)
 733         bitmap_clear_bit (wont_exit, 1);
 734
 735       if (exit_mod)
 736         {
 737           opt_info_start_duplication (opt_info);
 738           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 739                                               exit_mod,
 740                                               wont_exit, desc->out_edge,
 741                                               &remove_edges,
 742                                               DLTHE_FLAG_UPDATE_FREQ
 743                                               | (opt_info && exit_mod > 1
 744                                                  ? DLTHE_RECORD_COPY_NUMBER
 745                                                    : 0));
 746           gcc_assert (ok);
 747
 748           if (opt_info && exit_mod > 1)
 749             apply_opt_in_copies (opt_info, exit_mod, false, false);
 750
 751           desc->noloop_assumptions = NULL_RTX;
 752           desc->niter -= exit_mod;
 753           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 754           if (loop->any_estimate
 755               && double_int::from_uhwi (exit_mod).ule
 756                    (loop->nb_iterations_estimate))
 757             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 758           else
 759             loop->any_estimate = false;
 760         }
 761
 762       bitmap_set_bit (wont_exit, 1);
 763     }
 764   else
 765     {
 766       /* Leave exit test in last copy, for the same reason as above if
 767          the loop tests the condition at the end of loop body.  */
 768
 769       if (dump_file)
 770         fprintf (dump_file, ";; Condition at end of loop.\n");
 771
 772       /* We know that niter >= max_unroll + 2; so we do not need to care of
 773          case when we would exit before reaching the loop.  So just peel
 774          exit_mod + 1 iterations.  */
 775       if (exit_mod != max_unroll
 776           || desc->noloop_assumptions)
 777         {
 778           bitmap_clear_bit (wont_exit, 0);
 779           if (desc->noloop_assumptions)
 780             bitmap_clear_bit (wont_exit, 1);
 781
 782           opt_info_start_duplication (opt_info);
 783           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 784                                               exit_mod + 1,
 785                                               wont_exit, desc->out_edge,
 786                                               &remove_edges,
 787                                               DLTHE_FLAG_UPDATE_FREQ
 788                                               | (opt_info && exit_mod > 0
 789                                                  ? DLTHE_RECORD_COPY_NUMBER
 790                                                    : 0));
 791           gcc_assert (ok);
 792
 793           if (opt_info && exit_mod > 0)
 794             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 795
 796           desc->niter -= exit_mod + 1;
 797           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 798           if (loop->any_estimate
 799               && double_int::from_uhwi (exit_mod + 1).ule
 800                    (loop->nb_iterations_estimate))
 801             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 802           else
 803             loop->any_estimate = false;
 804           desc->noloop_assumptions = NULL_RTX;
 805
 806           bitmap_set_bit (wont_exit, 0);
 807           bitmap_set_bit (wont_exit, 1);
 808         }
 809
 810       bitmap_clear_bit (wont_exit, max_unroll);
 811     }
 812
 813   /* Now unroll the loop.  */
 814
 815   opt_info_start_duplication (opt_info);
 816   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 817                                       max_unroll,
 818                                       wont_exit, desc->out_edge,
 819                                       &remove_edges,
 820                                       DLTHE_FLAG_UPDATE_FREQ
 821                                       | (opt_info
 822                                          ? DLTHE_RECORD_COPY_NUMBER
 823                                            : 0));
 824   gcc_assert (ok);
 825
 826   if (opt_info)
 827     {
 828       apply_opt_in_copies (opt_info, max_unroll, true, true);
 829       free_opt_info (opt_info);
 830     }
 831
 832   free (wont_exit);
 833
 834   if (exit_at_end)
 835     {
 836       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 837       /* Find a new in and out edge; they are in the last copy we have made.  */
 838
 839       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 840         {
 841           desc->out_edge = EDGE_SUCC (exit_block, 0);
 842           desc->in_edge = EDGE_SUCC (exit_block, 1);
 843         }
 844       else
 845         {
 846           desc->out_edge = EDGE_SUCC (exit_block, 1);
 847           desc->in_edge = EDGE_SUCC (exit_block, 0);
 848         }
 849     }
 850
 851   desc->niter /= max_unroll + 1;
 852   loop->nb_iterations_upper_bound
 853     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 854                                                                    + 1),
 855                                             TRUNC_DIV_EXPR);
 856   if (loop->any_estimate)
 857     loop->nb_iterations_estimate
 858       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 859                                                                   + 1),
 860                                            TRUNC_DIV_EXPR);
 861   desc->niter_expr = GEN_INT (desc->niter);
 862
 863   /* Remove the edges.  */
 864   FOR_EACH_VEC_ELT (remove_edges, i, e)
 865     remove_path (e);
 866   remove_edges.release ();
 867
 868   if (dump_file)
 869     fprintf (dump_file,
 870              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 871              max_unroll, num_loop_insns (loop));
 872 }
 873
 874 /* Decide whether to unroll LOOP iterating runtime computable number of times
 875    and how much.  */
 876 static void
 877 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 878 {
 879   unsigned nunroll, nunroll_by_av, i;
 880   struct niter_desc *desc;
 881   double_int iterations;
 882
 883   if (!(flags & UAP_UNROLL))
 884     {
 885       /* We were not asked to, just return back silently.  */
 886       return;
 887     }
 888
 889   if (dump_file)
 890     fprintf (dump_file,
 891              "\n;; Considering unrolling loop with runtime "
 892              "computable number of iterations\n");
 893
 894   /* nunroll = total number of copies of the original loop body in
 895      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 896   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 897   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 898   if (nunroll > nunroll_by_av)
 899     nunroll = nunroll_by_av;
 900   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 901     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 902
 903   if (targetm.loop_unroll_adjust)
 904     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 905
 906   /* Skip big loops.  */
 907   if (nunroll <= 1)
 908     {
 909       if (dump_file)
 910         fprintf (dump_file, ";; Not considering loop, is too big\n");
 911       return;
 912     }
 913
 914   /* Check for simple loops.  */
 915   desc = get_simple_loop_desc (loop);
 916
 917   /* Check simpleness.  */
 918   if (!desc->simple_p || desc->assumptions)
 919     {
 920       if (dump_file)
 921         fprintf (dump_file,
 922                  ";; Unable to prove that the number of iterations "
 923                  "can be counted in runtime\n");
 924       return;
 925     }
 926
 927   if (desc->const_iter)
 928     {
 929       if (dump_file)
 930         fprintf (dump_file, ";; Loop iterates constant times\n");
 931       return;
 932     }
 933
 934   /* Check whether the loop rolls.  */
 935   if ((estimated_loop_iterations (loop, &iterations)
 936        || max_loop_iterations (loop, &iterations))
 937       && iterations.ult (double_int::from_shwi (2 * nunroll)))
 938     {
 939       if (dump_file)
 940         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 941       return;
 942     }
 943
 944   /* Success; now force nunroll to be power of 2, as we are unable to
 945      cope with overflows in computation of number of iterations.  */
 946   for (i = 1; 2 * i <= nunroll; i *= 2)
 947     continue;
 948
 949   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 950   loop->lpt_decision.times = i - 1;
 951 }
 952
 953 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
 954    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
 955    and NULL is returned instead.  */
 956
 957 basic_block
 958 split_edge_and_insert (edge e, rtx insns)
 959 {
 960   basic_block bb;
 961
 962   if (!insns)
 963     return NULL;
 964   bb = split_edge (e);
 965   emit_insn_after (insns, BB_END (bb));
 966
 967   /* ??? We used to assume that INSNS can contain control flow insns, and
 968      that we had to try to find sub basic blocks in BB to maintain a valid
 969      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
 970      and call break_superblocks when going out of cfglayout mode.  But it
 971      turns out that this never happens; and that if it does ever happen,
 972      the TODO_verify_flow at the end of the RTL loop passes would fail.
 973
 974      There are two reasons why we expected we could have control flow insns
 975      in INSNS.  The first is when a comparison has to be done in parts, and
 976      the second is when the number of iterations is computed for loops with
 977      the number of iterations known at runtime.  In both cases, test cases
 978      to get control flow in INSNS appear to be impossible to construct:
 979
 980       * If do_compare_rtx_and_jump needs several branches to do comparison
 981         in a mode that needs comparison by parts, we cannot analyze the
 982         number of iterations of the loop, and we never get to unrolling it.
 983
 984       * The code in expand_divmod that was suspected to cause creation of
 985         branching code seems to be only accessed for signed division.  The
 986         divisions used by # of iterations analysis are always unsigned.
 987         Problems might arise on architectures that emits branching code
 988         for some operations that may appear in the unroller (especially
 989         for division), but we have no such architectures.
 990
 991      Considering all this, it was decided that we should for now assume
 992      that INSNS can in theory contain control flow insns, but in practice
 993      it never does.  So we don't handle the theoretical case, and should
 994      a real failure ever show up, we have a pretty good clue for how to
 995      fix it.  */
 996
 997   return bb;
 998 }
 999
1000 /* Unroll LOOP for which we are able to count number of iterations in runtime
1001    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1002    extra care for case n < 0):
1003
1004    for (i = 0; i < n; i++)
1005      body;
1006
1007    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1008
1009    i = 0;
1010    mod = n % 4;
1011
1012    switch (mod)
1013      {
1014        case 3:
1015          body; i++;
1016        case 2:
1017          body; i++;
1018        case 1:
1019          body; i++;
1020        case 0: ;
1021      }
1022
1023    while (i < n)
1024      {
1025        body; i++;
1026        body; i++;
1027        body; i++;
1028        body; i++;
1029      }
1030    */
1031 static void
1032 unroll_loop_runtime_iterations (struct loop *loop)
1033 {
1034   rtx old_niter, niter, init_code, branch_code, tmp;
1035   unsigned i, j, p;
1036   basic_block preheader, *body, swtch, ezc_swtch;
1037   vec<basic_block> dom_bbs;
1038   sbitmap wont_exit;
1039   int may_exit_copy;
1040   unsigned n_peel;
1041   vec<edge> remove_edges;
1042   edge e;
1043   bool extra_zero_check, last_may_exit;
1044   unsigned max_unroll = loop->lpt_decision.times;
1045   struct niter_desc *desc = get_simple_loop_desc (loop);
1046   bool exit_at_end = loop_exit_at_end_p (loop);
1047   struct opt_info *opt_info = NULL;
1048   bool ok;
1049
1050   if (flag_split_ivs_in_unroller
1051       || flag_variable_expansion_in_unroller)
1052     opt_info = analyze_insns_in_loop (loop);
1053
1054   /* Remember blocks whose dominators will have to be updated.  */
1055   dom_bbs.create (0);
1056
1057   body = get_loop_body (loop);
1058   for (i = 0; i < loop->num_nodes; i++)
1059     {
1060       vec<basic_block> ldom;
1061       basic_block bb;
1062
1063       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1064       FOR_EACH_VEC_ELT (ldom, j, bb)
1065         if (!flow_bb_inside_loop_p (loop, bb))
1066           dom_bbs.safe_push (bb);
1067
1068       ldom.release ();
1069     }
1070   free (body);
1071
1072   if (!exit_at_end)
1073     {
1074       /* Leave exit in first copy (for explanation why see comment in
1075          unroll_loop_constant_iterations).  */
1076       may_exit_copy = 0;
1077       n_peel = max_unroll - 1;
1078       extra_zero_check = true;
1079       last_may_exit = false;
1080     }
1081   else
1082     {
1083       /* Leave exit in last copy (for explanation why see comment in
1084          unroll_loop_constant_iterations).  */
1085       may_exit_copy = max_unroll;
1086       n_peel = max_unroll;
1087       extra_zero_check = false;
1088       last_may_exit = true;
1089     }
1090
1091   /* Get expression for number of iterations.  */
1092   start_sequence ();
1093   old_niter = niter = gen_reg_rtx (desc->mode);
1094   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1095   if (tmp != niter)
1096     emit_move_insn (niter, tmp);
1097
1098   /* Count modulo by ANDing it with max_unroll; we use the fact that
1099      the number of unrollings is a power of two, and thus this is correct
1100      even if there is overflow in the computation.  */
1101   niter = expand_simple_binop (desc->mode, AND,
1102                                niter,
1103                                GEN_INT (max_unroll),
1104                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1105
1106   init_code = get_insns ();
1107   end_sequence ();
1108   unshare_all_rtl_in_chain (init_code);
1109
1110   /* Precondition the loop.  */
1111   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1112
1113   remove_edges.create (0);
1114
1115   wont_exit = sbitmap_alloc (max_unroll + 2);
1116
1117   /* Peel the first copy of loop body (almost always we must leave exit test
1118      here; the only exception is when we have extra zero check and the number
1119      of iterations is reliable.  Also record the place of (possible) extra
1120      zero check.  */
1121   bitmap_clear (wont_exit);
1122   if (extra_zero_check
1123       && !desc->noloop_assumptions)
1124     bitmap_set_bit (wont_exit, 1);
1125   ezc_swtch = loop_preheader_edge (loop)->src;
1126   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1127                                       1, wont_exit, desc->out_edge,
1128                                       &remove_edges,
1129                                       DLTHE_FLAG_UPDATE_FREQ);
1130   gcc_assert (ok);
1131
1132   /* Record the place where switch will be built for preconditioning.  */
1133   swtch = split_edge (loop_preheader_edge (loop));
1134
1135   for (i = 0; i < n_peel; i++)
1136     {
1137       /* Peel the copy.  */
1138       bitmap_clear (wont_exit);
1139       if (i != n_peel - 1 || !last_may_exit)
1140         bitmap_set_bit (wont_exit, 1);
1141       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1142                                           1, wont_exit, desc->out_edge,
1143                                           &remove_edges,
1144                                           DLTHE_FLAG_UPDATE_FREQ);
1145       gcc_assert (ok);
1146
1147       /* Create item for switch.  */
1148       j = n_peel - i - (extra_zero_check ? 0 : 1);
1149       p = REG_BR_PROB_BASE / (i + 2);
1150
1151       preheader = split_edge (loop_preheader_edge (loop));
1152       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1153                                           block_label (preheader), p,
1154                                           NULL_RTX);
1155
1156       /* We rely on the fact that the compare and jump cannot be optimized out,
1157          and hence the cfg we create is correct.  */
1158       gcc_assert (branch_code != NULL_RTX);
1159
1160       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1161       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1162       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1163       e = make_edge (swtch, preheader,
1164                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1165       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1166       e->probability = p;
1167     }
1168
1169   if (extra_zero_check)
1170     {
1171       /* Add branch for zero iterations.  */
1172       p = REG_BR_PROB_BASE / (max_unroll + 1);
1173       swtch = ezc_swtch;
1174       preheader = split_edge (loop_preheader_edge (loop));
1175       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1176                                           block_label (preheader), p,
1177                                           NULL_RTX);
1178       gcc_assert (branch_code != NULL_RTX);
1179
1180       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1181       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1182       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1183       e = make_edge (swtch, preheader,
1184                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1185       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1186       e->probability = p;
1187     }
1188
1189   /* Recount dominators for outer blocks.  */
1190   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1191
1192   /* And unroll loop.  */
1193
1194   bitmap_ones (wont_exit);
1195   bitmap_clear_bit (wont_exit, may_exit_copy);
1196   opt_info_start_duplication (opt_info);
1197
1198   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1199                                       max_unroll,
1200                                       wont_exit, desc->out_edge,
1201                                       &remove_edges,
1202                                       DLTHE_FLAG_UPDATE_FREQ
1203                                       | (opt_info
1204                                          ? DLTHE_RECORD_COPY_NUMBER
1205                                            : 0));
1206   gcc_assert (ok);
1207
1208   if (opt_info)
1209     {
1210       apply_opt_in_copies (opt_info, max_unroll, true, true);
1211       free_opt_info (opt_info);
1212     }
1213
1214   free (wont_exit);
1215
1216   if (exit_at_end)
1217     {
1218       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1219       /* Find a new in and out edge; they are in the last copy we have
1220          made.  */
1221
1222       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1223         {
1224           desc->out_edge = EDGE_SUCC (exit_block, 0);
1225           desc->in_edge = EDGE_SUCC (exit_block, 1);
1226         }
1227       else
1228         {
1229           desc->out_edge = EDGE_SUCC (exit_block, 1);
1230           desc->in_edge = EDGE_SUCC (exit_block, 0);
1231         }
1232     }
1233
1234   /* Remove the edges.  */
1235   FOR_EACH_VEC_ELT (remove_edges, i, e)
1236     remove_path (e);
1237   remove_edges.release ();
1238
1239   /* We must be careful when updating the number of iterations due to
1240      preconditioning and the fact that the value must be valid at entry
1241      of the loop.  After passing through the above code, we see that
1242      the correct new number of iterations is this:  */
1243   gcc_assert (!desc->const_iter);
1244   desc->niter_expr =
1245     simplify_gen_binary (UDIV, desc->mode, old_niter,
1246                          GEN_INT (max_unroll + 1));
1247   loop->nb_iterations_upper_bound
1248     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1249                                                                    + 1),
1250                                             TRUNC_DIV_EXPR);
1251   if (loop->any_estimate)
1252     loop->nb_iterations_estimate
1253       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1254                                                                   + 1),
1255                                            TRUNC_DIV_EXPR);
1256   if (exit_at_end)
1257     {
1258       desc->niter_expr =
1259         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1260       desc->noloop_assumptions = NULL_RTX;
1261       --loop->nb_iterations_upper_bound;
1262       if (loop->any_estimate
1263           && loop->nb_iterations_estimate != double_int_zero)
1264         --loop->nb_iterations_estimate;
1265       else
1266         loop->any_estimate = false;
1267     }
1268
1269   if (dump_file)
1270     fprintf (dump_file,
1271              ";; Unrolled loop %d times, counting # of iterations "
1272              "in runtime, %i insns\n",
1273              max_unroll, num_loop_insns (loop));
1274
1275   dom_bbs.release ();
1276 }
1277
1278 /* Decide whether to simply peel LOOP and how much.  */
1279 static void
1280 decide_peel_simple (struct loop *loop, int flags)
1281 {
1282   unsigned npeel;
1283   double_int iterations;
1284
1285   if (!(flags & UAP_PEEL))
1286     {
1287       /* We were not asked to, just return back silently.  */
1288       return;
1289     }
1290
1291   if (dump_file)
1292     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1293
1294   /* npeel = number of iterations to peel.  */
1295   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1296   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1297     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1298
1299   /* Skip big loops.  */
1300   if (!npeel)
1301     {
1302       if (dump_file)
1303         fprintf (dump_file, ";; Not considering loop, is too big\n");
1304       return;
1305     }
1306
1307   /* Do not simply peel loops with branches inside -- it increases number
1308      of mispredicts.
1309      Exception is when we do have profile and we however have good chance
1310      to peel proper number of iterations loop will iterate in practice.
1311      TODO: this heuristic needs tunning; while for complette unrolling
1312      the branch inside loop mostly eliminates any improvements, for
1313      peeling it is not the case.  Also a function call inside loop is
1314      also branch from branch prediction POV (and probably better reason
1315      to not unroll/peel).  */
1316   if (num_loop_branches (loop) > 1
1317       && profile_status != PROFILE_READ)
1318     {
1319       if (dump_file)
1320         fprintf (dump_file, ";; Not peeling, contains branches\n");
1321       return;
1322     }
1323
1324   /* If we have realistic estimate on number of iterations, use it.  */
1325   if (estimated_loop_iterations (loop, &iterations))
1326     {
1327       if (double_int::from_shwi (npeel).ule (iterations))
1328         {
1329           if (dump_file)
1330             {
1331               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1332               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1333                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1334               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1335                        npeel);
1336             }
1337           return;
1338         }
1339       npeel = iterations.to_shwi () + 1;
1340     }
1341   /* If we have small enough bound on iterations, we can still peel (completely
1342      unroll).  */
1343   else if (max_loop_iterations (loop, &iterations)
1344            && iterations.ult (double_int::from_shwi (npeel)))
1345     npeel = iterations.to_shwi () + 1;
1346   else
1347     {
1348       /* For now we have no good heuristics to decide whether loop peeling
1349          will be effective, so disable it.  */
1350       if (dump_file)
1351         fprintf (dump_file,
1352                  ";; Not peeling loop, no evidence it will be profitable\n");
1353       return;
1354     }
1355
1356   /* Success.  */
1357   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1358   loop->lpt_decision.times = npeel;
1359 }
1360
1361 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1362
1363    while (cond)
1364      body;
1365
1366    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1367
1368    if (!cond) goto end;
1369    body;
1370    if (!cond) goto end;
1371    body;
1372    if (!cond) goto end;
1373    body;
1374    while (cond)
1375      body;
1376    end: ;
1377    */
1378 static void
1379 peel_loop_simple (struct loop *loop)
1380 {
1381   sbitmap wont_exit;
1382   unsigned npeel = loop->lpt_decision.times;
1383   struct niter_desc *desc = get_simple_loop_desc (loop);
1384   struct opt_info *opt_info = NULL;
1385   bool ok;
1386
1387   if (flag_split_ivs_in_unroller && npeel > 1)
1388     opt_info = analyze_insns_in_loop (loop);
1389
1390   wont_exit = sbitmap_alloc (npeel + 1);
1391   bitmap_clear (wont_exit);
1392
1393   opt_info_start_duplication (opt_info);
1394
1395   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1396                                       npeel, wont_exit, NULL,
1397                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1398                                       | (opt_info
1399                                          ? DLTHE_RECORD_COPY_NUMBER
1400                                            : 0));
1401   gcc_assert (ok);
1402
1403   free (wont_exit);
1404
1405   if (opt_info)
1406     {
1407       apply_opt_in_copies (opt_info, npeel, false, false);
1408       free_opt_info (opt_info);
1409     }
1410
1411   if (desc->simple_p)
1412     {
1413       if (desc->const_iter)
1414         {
1415           desc->niter -= npeel;
1416           desc->niter_expr = GEN_INT (desc->niter);
1417           desc->noloop_assumptions = NULL_RTX;
1418         }
1419       else
1420         {
1421           /* We cannot just update niter_expr, as its value might be clobbered
1422              inside loop.  We could handle this by counting the number into
1423              temporary just like we do in runtime unrolling, but it does not
1424              seem worthwhile.  */
1425           free_simple_loop_desc (loop);
1426         }
1427     }
1428   if (dump_file)
1429     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1430 }
1431
1432 /* Decide whether to unroll LOOP stupidly and how much.  */
1433 static void
1434 decide_unroll_stupid (struct loop *loop, int flags)
1435 {
1436   unsigned nunroll, nunroll_by_av, i;
1437   struct niter_desc *desc;
1438   double_int iterations;
1439
1440   if (!(flags & UAP_UNROLL_ALL))
1441     {
1442       /* We were not asked to, just return back silently.  */
1443       return;
1444     }
1445
1446   if (dump_file)
1447     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1448
1449   /* nunroll = total number of copies of the original loop body in
1450      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1451   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1452   nunroll_by_av
1453     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1454   if (nunroll > nunroll_by_av)
1455     nunroll = nunroll_by_av;
1456   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1457     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1458
1459   if (targetm.loop_unroll_adjust)
1460     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1461
1462   /* Skip big loops.  */
1463   if (nunroll <= 1)
1464     {
1465       if (dump_file)
1466         fprintf (dump_file, ";; Not considering loop, is too big\n");
1467       return;
1468     }
1469
1470   /* Check for simple loops.  */
1471   desc = get_simple_loop_desc (loop);
1472
1473   /* Check simpleness.  */
1474   if (desc->simple_p && !desc->assumptions)
1475     {
1476       if (dump_file)
1477         fprintf (dump_file, ";; The loop is simple\n");
1478       return;
1479     }
1480
1481   /* Do not unroll loops with branches inside -- it increases number
1482      of mispredicts.
1483      TODO: this heuristic needs tunning; call inside the loop body
1484      is also relatively good reason to not unroll.  */
1485   if (num_loop_branches (loop) > 1)
1486     {
1487       if (dump_file)
1488         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1489       return;
1490     }
1491
1492   /* Check whether the loop rolls.  */
1493   if ((estimated_loop_iterations (loop, &iterations)
1494        || max_loop_iterations (loop, &iterations))
1495       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1496     {
1497       if (dump_file)
1498         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1499       return;
1500     }
1501
1502   /* Success.  Now force nunroll to be power of 2, as it seems that this
1503      improves results (partially because of better alignments, partially
1504      because of some dark magic).  */
1505   for (i = 1; 2 * i <= nunroll; i *= 2)
1506     continue;
1507
1508   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1509   loop->lpt_decision.times = i - 1;
1510 }
1511
1512 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1513
1514    while (cond)
1515      body;
1516
1517    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1518
1519    while (cond)
1520      {
1521        body;
1522        if (!cond) break;
1523        body;
1524        if (!cond) break;
1525        body;
1526        if (!cond) break;
1527        body;
1528      }
1529    */
1530 static void
1531 unroll_loop_stupid (struct loop *loop)
1532 {
1533   sbitmap wont_exit;
1534   unsigned nunroll = loop->lpt_decision.times;
1535   struct niter_desc *desc = get_simple_loop_desc (loop);
1536   struct opt_info *opt_info = NULL;
1537   bool ok;
1538
1539   if (flag_split_ivs_in_unroller
1540       || flag_variable_expansion_in_unroller)
1541     opt_info = analyze_insns_in_loop (loop);
1542
1543
1544   wont_exit = sbitmap_alloc (nunroll + 1);
1545   bitmap_clear (wont_exit);
1546   opt_info_start_duplication (opt_info);
1547
1548   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1549                                       nunroll, wont_exit,
1550                                       NULL, NULL,
1551                                       DLTHE_FLAG_UPDATE_FREQ
1552                                       | (opt_info
1553                                          ? DLTHE_RECORD_COPY_NUMBER
1554                                            : 0));
1555   gcc_assert (ok);
1556
1557   if (opt_info)
1558     {
1559       apply_opt_in_copies (opt_info, nunroll, true, true);
1560       free_opt_info (opt_info);
1561     }
1562
1563   free (wont_exit);
1564
1565   if (desc->simple_p)
1566     {
1567       /* We indeed may get here provided that there are nontrivial assumptions
1568          for a loop to be really simple.  We could update the counts, but the
1569          problem is that we are unable to decide which exit will be taken
1570          (not really true in case the number of iterations is constant,
1571          but noone will do anything with this information, so we do not
1572          worry about it).  */
1573       desc->simple_p = false;
1574     }
1575
1576   if (dump_file)
1577     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1578              nunroll, num_loop_insns (loop));
1579 }
1580
1581 /* A hash function for information about insns to split.  */
1582
1583 static hashval_t
1584 si_info_hash (const void *ivts)
1585 {
1586   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1587 }
1588
1589 /* An equality functions for information about insns to split.  */
1590
1591 static int
1592 si_info_eq (const void *ivts1, const void *ivts2)
1593 {
1594   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1595   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1596
1597   return i1->insn == i2->insn;
1598 }
1599
1600 /* Return a hash for VES, which is really a "var_to_expand *".  */
1601
1602 static hashval_t
1603 ve_info_hash (const void *ves)
1604 {
1605   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1606 }
1607
1608 /* Return true if IVTS1 and IVTS2 (which are really both of type
1609    "var_to_expand *") refer to the same instruction.  */
1610
1611 static int
1612 ve_info_eq (const void *ivts1, const void *ivts2)
1613 {
1614   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1615   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1616
1617   return i1->insn == i2->insn;
1618 }
1619
1620 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1621    Set *DEBUG_USES to the number of debug insns that reference the
1622    variable.  */
1623
1624 bool
1625 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1626                                   int *debug_uses)
1627 {
1628   basic_block *body, bb;
1629   unsigned i;
1630   int count_ref = 0;
1631   rtx insn;
1632
1633   body = get_loop_body (loop);
1634   for (i = 0; i < loop->num_nodes; i++)
1635     {
1636       bb = body[i];
1637
1638       FOR_BB_INSNS (bb, insn)
1639         if (!rtx_referenced_p (reg, insn))
1640           continue;
1641         else if (DEBUG_INSN_P (insn))
1642           ++*debug_uses;
1643         else if (++count_ref > 1)
1644           break;
1645     }
1646   free (body);
1647   return (count_ref  == 1);
1648 }
1649
1650 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1651
1652 static void
1653 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1654 {
1655   basic_block *body, bb;
1656   unsigned i;
1657   rtx insn;
1658
1659   body = get_loop_body (loop);
1660   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1661     {
1662       bb = body[i];
1663
1664       FOR_BB_INSNS (bb, insn)
1665         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1666           continue;
1667         else
1668           {
1669             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1670                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1671             if (!--debug_uses)
1672               break;
1673           }
1674     }
1675   free (body);
1676 }
1677
1678 /* Determine whether INSN contains an accumulator
1679    which can be expanded into separate copies,
1680    one for each copy of the LOOP body.
1681
1682    for (i = 0 ; i < n; i++)
1683      sum += a[i];
1684
1685    ==>
1686
1687    sum += a[i]
1688    ....
1689    i = i+1;
1690    sum1 += a[i]
1691    ....
1692    i = i+1
1693    sum2 += a[i];
1694    ....
1695
1696    Return NULL if INSN contains no opportunity for expansion of accumulator.
1697    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1698    information and return a pointer to it.
1699 */
1700
1701 static struct var_to_expand *
1702 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1703 {
1704   rtx set, dest, src;
1705   struct var_to_expand *ves;
1706   unsigned accum_pos;
1707   enum rtx_code code;
1708   int debug_uses = 0;
1709
1710   set = single_set (insn);
1711   if (!set)
1712     return NULL;
1713
1714   dest = SET_DEST (set);
1715   src = SET_SRC (set);
1716   code = GET_CODE (src);
1717
1718   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1719     return NULL;
1720
1721   if (FLOAT_MODE_P (GET_MODE (dest)))
1722     {
1723       if (!flag_associative_math)
1724         return NULL;
1725       /* In the case of FMA, we're also changing the rounding.  */
1726       if (code == FMA && !flag_unsafe_math_optimizations)
1727         return NULL;
1728     }
1729
1730   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1731      in MD.  But if there is no optab to generate the insn, we can not
1732      perform the variable expansion.  This can happen if an MD provides
1733      an insn but not a named pattern to generate it, for example to avoid
1734      producing code that needs additional mode switches like for x87/mmx.
1735
1736      So we check have_insn_for which looks for an optab for the operation
1737      in SRC.  If it doesn't exist, we can't perform the expansion even
1738      though INSN is valid.  */
1739   if (!have_insn_for (code, GET_MODE (src)))
1740     return NULL;
1741
1742   if (!REG_P (dest)
1743       && !(GET_CODE (dest) == SUBREG
1744            && REG_P (SUBREG_REG (dest))))
1745     return NULL;
1746
1747   /* Find the accumulator use within the operation.  */
1748   if (code == FMA)
1749     {
1750       /* We only support accumulation via FMA in the ADD position.  */
1751       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1752         return NULL;
1753       accum_pos = 2;
1754     }
1755   else if (rtx_equal_p (dest, XEXP (src, 0)))
1756     accum_pos = 0;
1757   else if (rtx_equal_p (dest, XEXP (src, 1)))
1758     {
1759       /* The method of expansion that we are using; which includes the
1760          initialization of the expansions with zero and the summation of
1761          the expansions at the end of the computation will yield wrong
1762          results for (x = something - x) thus avoid using it in that case.  */
1763       if (code == MINUS)
1764         return NULL;
1765       accum_pos = 1;
1766     }
1767   else
1768     return NULL;
1769
1770   /* It must not otherwise be used.  */
1771   if (code == FMA)
1772     {
1773       if (rtx_referenced_p (dest, XEXP (src, 0))
1774           || rtx_referenced_p (dest, XEXP (src, 1)))
1775         return NULL;
1776     }
1777   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1778     return NULL;
1779
1780   /* It must be used in exactly one insn.  */
1781   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1782     return NULL;
1783
1784   if (dump_file)
1785     {
1786       fprintf (dump_file, "\n;; Expanding Accumulator ");
1787       print_rtl (dump_file, dest);
1788       fprintf (dump_file, "\n");
1789     }
1790
1791   if (debug_uses)
1792     /* Instead of resetting the debug insns, we could replace each
1793        debug use in the loop with the sum or product of all expanded
1794        accummulators.  Since we'll only know of all expansions at the
1795        end, we'd have to keep track of which vars_to_expand a debug
1796        insn in the loop references, take note of each copy of the
1797        debug insn during unrolling, and when it's all done, compute
1798        the sum or product of each variable and adjust the original
1799        debug insn and each copy thereof.  What a pain!  */
1800     reset_debug_uses_in_loop (loop, dest, debug_uses);
1801
1802   /* Record the accumulator to expand.  */
1803   ves = XNEW (struct var_to_expand);
1804   ves->insn = insn;
1805   ves->reg = copy_rtx (dest);
1806   ves->var_expansions.create (1);
1807   ves->next = NULL;
1808   ves->op = GET_CODE (src);
1809   ves->expansion_count = 0;
1810   ves->reuse_expansion = 0;
1811   return ves;
1812 }
1813
1814 /* Determine whether there is an induction variable in INSN that
1815    we would like to split during unrolling.
1816
1817    I.e. replace
1818
1819    i = i + 1;
1820    ...
1821    i = i + 1;
1822    ...
1823    i = i + 1;
1824    ...
1825
1826    type chains by
1827
1828    i0 = i + 1
1829    ...
1830    i = i0 + 1
1831    ...
1832    i = i0 + 2
1833    ...
1834
1835    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1836    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1837    pointer to it.  */
1838
1839 static struct iv_to_split *
1840 analyze_iv_to_split_insn (rtx insn)
1841 {
1842   rtx set, dest;
1843   struct rtx_iv iv;
1844   struct iv_to_split *ivts;
1845   bool ok;
1846
1847   /* For now we just split the basic induction variables.  Later this may be
1848      extended for example by selecting also addresses of memory references.  */
1849   set = single_set (insn);
1850   if (!set)
1851     return NULL;
1852
1853   dest = SET_DEST (set);
1854   if (!REG_P (dest))
1855     return NULL;
1856
1857   if (!biv_p (insn, dest))
1858     return NULL;
1859
1860   ok = iv_analyze_result (insn, dest, &iv);
1861
1862   /* This used to be an assert under the assumption that if biv_p returns
1863      true that iv_analyze_result must also return true.  However, that
1864      assumption is not strictly correct as evidenced by pr25569.
1865
1866      Returning NULL when iv_analyze_result returns false is safe and
1867      avoids the problems in pr25569 until the iv_analyze_* routines
1868      can be fixed, which is apparently hard and time consuming
1869      according to their author.  */
1870   if (! ok)
1871     return NULL;
1872
1873   if (iv.step == const0_rtx
1874       || iv.mode != iv.extend_mode)
1875     return NULL;
1876
1877   /* Record the insn to split.  */
1878   ivts = XNEW (struct iv_to_split);
1879   ivts->insn = insn;
1880   ivts->orig_var = dest;
1881   ivts->base_var = NULL_RTX;
1882   ivts->step = iv.step;
1883   ivts->next = NULL;
1884   ivts->n_loc = 1;
1885   ivts->loc[0] = 1;
1886
1887   return ivts;
1888 }
1889
1890 /* Determines which of insns in LOOP can be optimized.
1891    Return a OPT_INFO struct with the relevant hash tables filled
1892    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1893    is undefined for the return value.  */
1894
1895 static struct opt_info *
1896 analyze_insns_in_loop (struct loop *loop)
1897 {
1898   basic_block *body, bb;
1899   unsigned i;
1900   struct opt_info *opt_info = XCNEW (struct opt_info);
1901   rtx insn;
1902   struct iv_to_split *ivts = NULL;
1903   struct var_to_expand *ves = NULL;
1904   PTR *slot1;
1905   PTR *slot2;
1906   vec<edge> edges = get_loop_exit_edges (loop);
1907   edge exit;
1908   bool can_apply = false;
1909
1910   iv_analysis_loop_init (loop);
1911
1912   body = get_loop_body (loop);
1913
1914   if (flag_split_ivs_in_unroller)
1915     {
1916       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1917                                               si_info_hash, si_info_eq, free);
1918       opt_info->iv_to_split_head = NULL;
1919       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1920     }
1921
1922   /* Record the loop exit bb and loop preheader before the unrolling.  */
1923   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1924
1925   if (edges.length () == 1)
1926     {
1927       exit = edges[0];
1928       if (!(exit->flags & EDGE_COMPLEX))
1929         {
1930           opt_info->loop_exit = split_edge (exit);
1931           can_apply = true;
1932         }
1933     }
1934
1935   if (flag_variable_expansion_in_unroller
1936       && can_apply)
1937     {
1938       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1939                                                         ve_info_hash,
1940                                                         ve_info_eq, free);
1941       opt_info->var_to_expand_head = NULL;
1942       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1943     }
1944
1945   for (i = 0; i < loop->num_nodes; i++)
1946     {
1947       bb = body[i];
1948       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1949         continue;
1950
1951       FOR_BB_INSNS (bb, insn)
1952       {
1953         if (!INSN_P (insn))
1954           continue;
1955
1956         if (opt_info->insns_to_split)
1957           ivts = analyze_iv_to_split_insn (insn);
1958
1959         if (ivts)
1960           {
1961             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1962             gcc_assert (*slot1 == NULL);
1963             *slot1 = ivts;
1964             *opt_info->iv_to_split_tail = ivts;
1965             opt_info->iv_to_split_tail = &ivts->next;
1966             continue;
1967           }
1968
1969         if (opt_info->insns_with_var_to_expand)
1970           ves = analyze_insn_to_expand_var (loop, insn);
1971
1972         if (ves)
1973           {
1974             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1975             gcc_assert (*slot2 == NULL);
1976             *slot2 = ves;
1977             *opt_info->var_to_expand_tail = ves;
1978             opt_info->var_to_expand_tail = &ves->next;
1979           }
1980       }
1981     }
1982
1983   edges.release ();
1984   free (body);
1985   return opt_info;
1986 }
1987
1988 /* Called just before loop duplication.  Records start of duplicated area
1989    to OPT_INFO.  */
1990
1991 static void
1992 opt_info_start_duplication (struct opt_info *opt_info)
1993 {
1994   if (opt_info)
1995     opt_info->first_new_block = last_basic_block;
1996 }
1997
/* Return the number of iterations that separate the initialization of the
   base variable from the copy number N_COPY.  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if the loop is being
   unrolled, false if it is being peeled.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization lives in the original loop body,
     which has number 0, so the distance is the copy number itself.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization lives in copy number 1 and the
     original loop body (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
2022
2023 /* Locate in EXPR the expression corresponding to the location recorded
2024    in IVTS, and return a pointer to the RTX for this location.  */
2025
2026 static rtx *
2027 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2028 {
2029   unsigned i;
2030   rtx *ret = &expr;
2031
2032   for (i = 0; i < ivts->n_loc; i++)
2033     ret = &XEXP (*ret, ivts->loc[i]);
2034
2035   return ret;
2036 }
2037
2038 /* Allocate basic variable for the induction variable chain.  */
2039
2040 static void
2041 allocate_basic_variable (struct iv_to_split *ivts)
2042 {
2043   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2044
2045   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2046 }
2047
2048 /* Insert initialization of basic variable of IVTS before INSN, taking
2049    the initial value from INSN.  */
2050
2051 static void
2052 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2053 {
2054   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2055   rtx seq;
2056
2057   start_sequence ();
2058   expr = force_operand (expr, ivts->base_var);
2059   if (expr != ivts->base_var)
2060     emit_move_insn (ivts->base_var, expr);
2061   seq = get_insns ();
2062   end_sequence ();
2063
2064   emit_insn_before (seq, insn);
2065 }
2066
2067 /* Replace the use of induction variable described in IVTS in INSN
2068    by base variable + DELTA * step.  */
2069
2070 static void
2071 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2072 {
2073   rtx expr, *loc, seq, incr, var;
2074   enum machine_mode mode = GET_MODE (ivts->base_var);
2075   rtx src, dest, set;
2076
2077   /* Construct base + DELTA * step.  */
2078   if (!delta)
2079     expr = ivts->base_var;
2080   else
2081     {
2082       incr = simplify_gen_binary (MULT, mode,
2083                                   ivts->step, gen_int_mode (delta, mode));
2084       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2085                                   ivts->base_var, incr);
2086     }
2087
2088   /* Figure out where to do the replacement.  */
2089   loc = get_ivts_expr (single_set (insn), ivts);
2090
2091   /* If we can make the replacement right away, we're done.  */
2092   if (validate_change (insn, loc, expr, 0))
2093     return;
2094
2095   /* Otherwise, force EXPR into a register and try again.  */
2096   start_sequence ();
2097   var = gen_reg_rtx (mode);
2098   expr = force_operand (expr, var);
2099   if (expr != var)
2100     emit_move_insn (var, expr);
2101   seq = get_insns ();
2102   end_sequence ();
2103   emit_insn_before (seq, insn);
2104
2105   if (validate_change (insn, loc, var, 0))
2106     return;
2107
2108   /* The last chance.  Try recreating the assignment in insn
2109      completely from scratch.  */
2110   set = single_set (insn);
2111   gcc_assert (set);
2112
2113   start_sequence ();
2114   *loc = var;
2115   src = copy_rtx (SET_SRC (set));
2116   dest = copy_rtx (SET_DEST (set));
2117   src = force_operand (src, dest);
2118   if (src != dest)
2119     emit_move_insn (dest, src);
2120   seq = get_insns ();
2121   end_sequence ();
2122
2123   emit_insn_before (seq, insn);
2124   delete_insn (insn);
2125 }
2126
2127
2128 /* Return one expansion of the accumulator recorded in struct VE.  */
2129
2130 static rtx
2131 get_expansion (struct var_to_expand *ve)
2132 {
2133   rtx reg;
2134
2135   if (ve->reuse_expansion == 0)
2136     reg = ve->reg;
2137   else
2138     reg = ve->var_expansions[ve->reuse_expansion - 1];
2139
2140   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2141     ve->reuse_expansion = 0;
2142   else
2143     ve->reuse_expansion++;
2144
2145   return reg;
2146 }
2147
2148
2149 /* Given INSN replace the uses of the accumulator recorded in VE
2150    with a new register.  */
2151
2152 static void
2153 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2154 {
2155   rtx new_reg, set;
2156   bool really_new_expansion = false;
2157
2158   set = single_set (insn);
2159   gcc_assert (set);
2160
2161   /* Generate a new register only if the expansion limit has not been
2162      reached.  Else reuse an already existing expansion.  */
2163   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2164     {
2165       really_new_expansion = true;
2166       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2167     }
2168   else
2169     new_reg = get_expansion (ve);
2170
2171   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2172   if (apply_change_group ())
2173     if (really_new_expansion)
2174       {
2175         ve->var_expansions.safe_push (new_reg);
2176         ve->expansion_count++;
2177       }
2178 }
2179
2180 /* Initialize the variable expansions in loop preheader.  PLACE is the
2181    loop-preheader basic block where the initialization of the
2182    expansions should take place.  The expansions are initialized with
2183    (-0) when the operation is plus or minus to honor sign zero.  This
2184    way we can prevent cases where the sign of the final result is
2185    effected by the sign of the expansion.  Here is an example to
2186    demonstrate this:
2187
2188    for (i = 0 ; i < n; i++)
2189      sum += something;
2190
2191    ==>
2192
2193    sum += something
2194    ....
2195    i = i+1;
2196    sum1 += something
2197    ....
2198    i = i+1
2199    sum2 += something;
2200    ....
2201
2202    When SUM is initialized with -zero and SOMETHING is also -zero; the
2203    final result of sum should be -zero thus the expansions sum1 and sum2
2204    should be initialized with -zero as well (otherwise we will get +zero
2205    as the final result).  */
2206
2207 static void
2208 insert_var_expansion_initialization (struct var_to_expand *ve,
2209                                      basic_block place)
2210 {
2211   rtx seq, var, zero_init;
2212   unsigned i;
2213   enum machine_mode mode = GET_MODE (ve->reg);
2214   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2215
2216   if (ve->var_expansions.length () == 0)
2217     return;
2218
2219   start_sequence ();
2220   switch (ve->op)
2221     {
2222     case FMA:
2223       /* Note that we only accumulate FMA via the ADD operand.  */
2224     case PLUS:
2225     case MINUS:
2226       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2227         {
2228           if (honor_signed_zero_p)
2229             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2230           else
2231             zero_init = CONST0_RTX (mode);
2232           emit_move_insn (var, zero_init);
2233         }
2234       break;
2235
2236     case MULT:
2237       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2238         {
2239           zero_init = CONST1_RTX (GET_MODE (var));
2240           emit_move_insn (var, zero_init);
2241         }
2242       break;
2243
2244     default:
2245       gcc_unreachable ();
2246     }
2247
2248   seq = get_insns ();
2249   end_sequence ();
2250
2251   emit_insn_after (seq, BB_END (place));
2252 }
2253
2254 /* Combine the variable expansions at the loop exit.  PLACE is the
2255    loop exit basic block where the summation of the expansions should
2256    take place.  */
2257
2258 static void
2259 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2260 {
2261   rtx sum = ve->reg;
2262   rtx expr, seq, var, insn;
2263   unsigned i;
2264
2265   if (ve->var_expansions.length () == 0)
2266     return;
2267
2268   start_sequence ();
2269   switch (ve->op)
2270     {
2271     case FMA:
2272       /* Note that we only accumulate FMA via the ADD operand.  */
2273     case PLUS:
2274     case MINUS:
2275       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2276         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2277       break;
2278
2279     case MULT:
2280       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2281         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2282       break;
2283
2284     default:
2285       gcc_unreachable ();
2286     }
2287
2288   expr = force_operand (sum, ve->reg);
2289   if (expr != ve->reg)
2290     emit_move_insn (ve->reg, expr);
2291   seq = get_insns ();
2292   end_sequence ();
2293
2294   insn = BB_HEAD (place);
2295   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2296     insn = NEXT_INSN (insn);
2297
2298   emit_insn_after (seq, insn);
2299 }
2300
2301 /* Strip away REG_EQUAL notes for IVs we're splitting.
2302
2303    Updating REG_EQUAL notes for IVs we split is tricky: We
2304    cannot tell until after unrolling, DF-rescanning, and liveness
2305    updating, whether an EQ_USE is reached by the split IV while
2306    the IV reg is still live.  See PR55006.
2307
2308    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2309    because RTL loop-iv requires us to defer rescanning insns and
2310    any notes attached to them.  So resort to old techniques...  */
2311
2312 static void
2313 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2314 {
2315   struct iv_to_split *ivts;
2316   rtx note = find_reg_equal_equiv_note (insn);
2317   if (! note)
2318     return;
2319   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2320     if (reg_mentioned_p (ivts->orig_var, note))
2321       {
2322         remove_note (insn, note);
2323         return;
2324       }
2325 }
2326
2327 /* Apply loop optimizations in loop copies using the
2328    data which gathered during the unrolling.  Structure
2329    OPT_INFO record that data.
2330
2331    UNROLLING is true if we unrolled (not peeled) the loop.
2332    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2333    the loop (as it should happen in complete unrolling, but not in ordinary
2334    peeling of the loop).  */
2335
2336 static void
2337 apply_opt_in_copies (struct opt_info *opt_info,
2338                      unsigned n_copies, bool unrolling,
2339                      bool rewrite_original_loop)
2340 {
2341   unsigned i, delta;
2342   basic_block bb, orig_bb;
2343   rtx insn, orig_insn, next;
2344   struct iv_to_split ivts_templ, *ivts;
2345   struct var_to_expand ve_templ, *ves;
2346
2347   /* Sanity check -- we need to put initialization in the original loop
2348      body.  */
2349   gcc_assert (!unrolling || rewrite_original_loop);
2350
2351   /* Allocate the basic variables (i0).  */
2352   if (opt_info->insns_to_split)
2353     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2354       allocate_basic_variable (ivts);
2355
2356   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2357     {
2358       bb = BASIC_BLOCK (i);
2359       orig_bb = get_bb_original (bb);
2360
2361       /* bb->aux holds position in copy sequence initialized by
2362          duplicate_loop_to_header_edge.  */
2363       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2364                                         unrolling);
2365       bb->aux = 0;
2366       orig_insn = BB_HEAD (orig_bb);
2367       FOR_BB_INSNS_SAFE (bb, insn, next)
2368         {
2369           if (!INSN_P (insn)
2370               || (DEBUG_INSN_P (insn)
2371                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2372             continue;
2373
2374           while (!INSN_P (orig_insn)
2375                  || (DEBUG_INSN_P (orig_insn)
2376                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2377                          == LABEL_DECL)))
2378             orig_insn = NEXT_INSN (orig_insn);
2379
2380           ivts_templ.insn = orig_insn;
2381           ve_templ.insn = orig_insn;
2382
2383           /* Apply splitting iv optimization.  */
2384           if (opt_info->insns_to_split)
2385             {
2386               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2387
2388               ivts = (struct iv_to_split *)
2389                 htab_find (opt_info->insns_to_split, &ivts_templ);
2390
2391               if (ivts)
2392                 {
2393                   gcc_assert (GET_CODE (PATTERN (insn))
2394                               == GET_CODE (PATTERN (orig_insn)));
2395
2396                   if (!delta)
2397                     insert_base_initialization (ivts, insn);
2398                   split_iv (ivts, insn, delta);
2399                 }
2400             }
2401           /* Apply variable expansion optimization.  */
2402           if (unrolling && opt_info->insns_with_var_to_expand)
2403             {
2404               ves = (struct var_to_expand *)
2405                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2406               if (ves)
2407                 {
2408                   gcc_assert (GET_CODE (PATTERN (insn))
2409                               == GET_CODE (PATTERN (orig_insn)));
2410                   expand_var_during_unrolling (ves, insn);
2411                 }
2412             }
2413           orig_insn = NEXT_INSN (orig_insn);
2414         }
2415     }
2416
2417   if (!rewrite_original_loop)
2418     return;
2419
2420   /* Initialize the variable expansions in the loop preheader
2421      and take care of combining them at the loop exit.  */
2422   if (opt_info->insns_with_var_to_expand)
2423     {
2424       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2425         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2426       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2427         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2428     }
2429
2430   /* Rewrite also the original loop body.  Find them as originals of the blocks
2431      in the last copied iteration, i.e. those that have
2432      get_bb_copy (get_bb_original (bb)) == bb.  */
2433   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2434     {
2435       bb = BASIC_BLOCK (i);
2436       orig_bb = get_bb_original (bb);
2437       if (get_bb_copy (orig_bb) != bb)
2438         continue;
2439
2440       delta = determine_split_iv_delta (0, n_copies, unrolling);
2441       for (orig_insn = BB_HEAD (orig_bb);
2442            orig_insn != NEXT_INSN (BB_END (bb));
2443            orig_insn = next)
2444         {
2445           next = NEXT_INSN (orig_insn);
2446
2447           if (!INSN_P (orig_insn))
2448             continue;
2449
2450           ivts_templ.insn = orig_insn;
2451           if (opt_info->insns_to_split)
2452             {
2453               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2454
2455               ivts = (struct iv_to_split *)
2456                 htab_find (opt_info->insns_to_split, &ivts_templ);
2457               if (ivts)
2458                 {
2459                   if (!delta)
2460                     insert_base_initialization (ivts, orig_insn);
2461                   split_iv (ivts, orig_insn, delta);
2462                   continue;
2463                 }
2464             }
2465
2466         }
2467     }
2468 }
2469
2470 /* Release OPT_INFO.  */
2471
2472 static void
2473 free_opt_info (struct opt_info *opt_info)
2474 {
2475   if (opt_info->insns_to_split)
2476     htab_delete (opt_info->insns_to_split);
2477   if (opt_info->insns_with_var_to_expand)
2478     {
2479       struct var_to_expand *ves;
2480
2481       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2482         ves->var_expansions.release ();
2483       htab_delete (opt_info->insns_with_var_to_expand);
2484     }
2485   free (opt_info);
2486 }