gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "rtl.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hashtab.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
  42    by fewer testings of exit conditions.  The main problem is code growth,
  43    that impacts performance negatively due to effect of caches.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
   -- unrolling of loops that roll constant times; this is almost always
      a win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll/peel.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of several %.
  70    */
  71
  72 /* Information about induction variables to split.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx insn;             /* The insn in that the induction variable occurs.  */
  77   rtx orig_var;         /* The variable (register) for the IV before split.  */
  78   rtx base_var;         /* The variable on that the values in the further
  79                            iterations are based.  */
  80   rtx step;             /* Step of the induction variable.  */
  81   struct iv_to_split *next; /* Next entry in walking order.  */
  82   unsigned n_loc;
  83   unsigned loc[3];      /* Location where the definition of the induction
  84                            variable occurs in the insn.  For example if
  85                            N_LOC is 2, the expression is located at
  86                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  87 };
  88
  89 /* Information about accumulators to expand.  */
  90
  91 struct var_to_expand
  92 {
  93   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  94   rtx reg;                         /* The accumulator which is expanded.  */
  95   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  96   struct var_to_expand *next;      /* Next entry in walking order.  */
  97   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  98                                       or multiplication.  */
  99   int expansion_count;             /* Count the number of expansions generated so far.  */
 100   int reuse_expansion;             /* The expansion we intend to reuse to expand
 101                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 102                                       the original accumulator.  Else use
 103                                       var_expansions[REUSE_EXPANSION - 1].  */
 104 };
 105
 106 /* Information about optimization applied in
 107    the unrolled loop.  */
 108
 109 struct opt_info
 110 {
 111   htab_t insns_to_split;           /* A hashtable of insns to split.  */
 112   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 113   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 114   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
 115                                       to expand.  */
 116   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 117   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 118   unsigned first_new_block;        /* The first basic block that was
 119                                       duplicated.  */
 120   basic_block loop_exit;           /* The loop exit basic block.  */
 121   basic_block loop_preheader;      /* The loop preheader basic block.  */
 122 };
 123
 124 static void decide_unrolling_and_peeling (int);
 125 static void peel_loops_completely (int);
 126 static void decide_peel_simple (struct loop *, int);
 127 static void decide_peel_once_rolling (struct loop *, int);
 128 static void decide_peel_completely (struct loop *, int);
 129 static void decide_unroll_stupid (struct loop *, int);
 130 static void decide_unroll_constant_iterations (struct loop *, int);
 131 static void decide_unroll_runtime_iterations (struct loop *, int);
 132 static void peel_loop_simple (struct loop *);
 133 static void peel_loop_completely (struct loop *);
 134 static void unroll_loop_stupid (struct loop *);
 135 static void unroll_loop_constant_iterations (struct loop *);
 136 static void unroll_loop_runtime_iterations (struct loop *);
 137 static struct opt_info *analyze_insns_in_loop (struct loop *);
 138 static void opt_info_start_duplication (struct opt_info *);
 139 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 140 static void free_opt_info (struct opt_info *);
 141 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 142 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 143 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 144 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 145 static void insert_var_expansion_initialization (struct var_to_expand *,
 146                                                  basic_block);
 147 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 148                                              basic_block);
 149 static rtx get_expansion (struct var_to_expand *);
 150
 151 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 152 void
 153 unroll_and_peel_loops (int flags)
 154 {
 155   struct loop *loop;
 156   bool check;
 157   loop_iterator li;
 158
 159   /* First perform complete loop peeling (it is almost surely a win,
 160      and affects parameters for further decision a lot).  */
 161   peel_loops_completely (flags);
 162
 163   /* Now decide rest of unrolling and peeling.  */
 164   decide_unrolling_and_peeling (flags);
 165
 166   /* Scan the loops, inner ones first.  */
 167   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 168     {
 169       check = true;
 170       /* And perform the appropriate transformations.  */
 171       switch (loop->lpt_decision.decision)
 172         {
 173         case LPT_PEEL_COMPLETELY:
 174           /* Already done.  */
 175           gcc_unreachable ();
 176         case LPT_PEEL_SIMPLE:
 177           peel_loop_simple (loop);
 178           break;
 179         case LPT_UNROLL_CONSTANT:
 180           unroll_loop_constant_iterations (loop);
 181           break;
 182         case LPT_UNROLL_RUNTIME:
 183           unroll_loop_runtime_iterations (loop);
 184           break;
 185         case LPT_UNROLL_STUPID:
 186           unroll_loop_stupid (loop);
 187           break;
 188         case LPT_NONE:
 189           check = false;
 190           break;
 191         default:
 192           gcc_unreachable ();
 193         }
 194       if (check)
 195         {
 196 #ifdef ENABLE_CHECKING
 197           verify_loop_structure ();
 198 #endif
 199         }
 200     }
 201
 202   iv_analysis_done ();
 203 }
 204
 205 /* Check whether exit of the LOOP is at the end of loop body.  */
 206
 207 static bool
 208 loop_exit_at_end_p (struct loop *loop)
 209 {
 210   struct niter_desc *desc = get_simple_loop_desc (loop);
 211   rtx insn;
 212
 213   if (desc->in_edge->dest != loop->latch)
 214     return false;
 215
 216   /* Check that the latch is empty.  */
 217   FOR_BB_INSNS (loop->latch, insn)
 218     {
 219       if (NONDEBUG_INSN_P (insn))
 220         return false;
 221     }
 222
 223   return true;
 224 }
 225
 226 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 227 static void
 228 peel_loops_completely (int flags)
 229 {
 230   struct loop *loop;
 231   loop_iterator li;
 232
 233   /* Scan the loops, the inner ones first.  */
 234   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 235     {
 236       loop->lpt_decision.decision = LPT_NONE;
 237
 238       if (dump_file)
 239         fprintf (dump_file,
 240                  "\n;; *** Considering loop %d for complete peeling ***\n",
 241                  loop->num);
 242
 243       loop->ninsns = num_loop_insns (loop);
 244
 245       decide_peel_once_rolling (loop, flags);
 246       if (loop->lpt_decision.decision == LPT_NONE)
 247         decide_peel_completely (loop, flags);
 248
 249       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 250         {
 251           peel_loop_completely (loop);
 252 #ifdef ENABLE_CHECKING
 253           verify_loop_structure ();
 254 #endif
 255         }
 256     }
 257 }
 258
 259 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 260 static void
 261 decide_unrolling_and_peeling (int flags)
 262 {
 263   struct loop *loop;
 264   loop_iterator li;
 265
 266   /* Scan the loops, inner ones first.  */
 267   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 268     {
 269       loop->lpt_decision.decision = LPT_NONE;
 270
 271       if (dump_file)
 272         fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 273
 274       /* Do not peel cold areas.  */
 275       if (optimize_loop_for_size_p (loop))
 276         {
 277           if (dump_file)
 278             fprintf (dump_file, ";; Not considering loop, cold area\n");
 279           continue;
 280         }
 281
 282       /* Can the loop be manipulated?  */
 283       if (!can_duplicate_loop_p (loop))
 284         {
 285           if (dump_file)
 286             fprintf (dump_file,
 287                      ";; Not considering loop, cannot duplicate\n");
 288           continue;
 289         }
 290
 291       /* Skip non-innermost loops.  */
 292       if (loop->inner)
 293         {
 294           if (dump_file)
 295             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 296           continue;
 297         }
 298
 299       loop->ninsns = num_loop_insns (loop);
 300       loop->av_ninsns = average_num_loop_insns (loop);
 301
 302       /* Try transformations one by one in decreasing order of
 303          priority.  */
 304
 305       decide_unroll_constant_iterations (loop, flags);
 306       if (loop->lpt_decision.decision == LPT_NONE)
 307         decide_unroll_runtime_iterations (loop, flags);
 308       if (loop->lpt_decision.decision == LPT_NONE)
 309         decide_unroll_stupid (loop, flags);
 310       if (loop->lpt_decision.decision == LPT_NONE)
 311         decide_peel_simple (loop, flags);
 312     }
 313 }
 314
 315 /* Decide whether the LOOP is once rolling and suitable for complete
 316    peeling.  */
 317 static void
 318 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 319 {
 320   struct niter_desc *desc;
 321
 322   if (dump_file)
 323     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 324
 325   /* Is the loop small enough?  */
 326   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 327     {
 328       if (dump_file)
 329         fprintf (dump_file, ";; Not considering loop, is too big\n");
 330       return;
 331     }
 332
 333   /* Check for simple loops.  */
 334   desc = get_simple_loop_desc (loop);
 335
 336   /* Check number of iterations.  */
 337   if (!desc->simple_p
 338       || desc->assumptions
 339       || desc->infinite
 340       || !desc->const_iter
 341       || (desc->niter != 0
 342           && max_loop_iterations_int (loop) != 0))
 343     {
 344       if (dump_file)
 345         fprintf (dump_file,
 346                  ";; Unable to prove that the loop rolls exactly once\n");
 347       return;
 348     }
 349
 350   /* Success.  */
 351   if (dump_file)
 352     fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
 353   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 354 }
 355
 356 /* Decide whether the LOOP is suitable for complete peeling.  */
 357 static void
 358 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 359 {
 360   unsigned npeel;
 361   struct niter_desc *desc;
 362
 363   if (dump_file)
 364     fprintf (dump_file, "\n;; Considering peeling completely\n");
 365
 366   /* Skip non-innermost loops.  */
 367   if (loop->inner)
 368     {
 369       if (dump_file)
 370         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 371       return;
 372     }
 373
 374   /* Do not peel cold areas.  */
 375   if (optimize_loop_for_size_p (loop))
 376     {
 377       if (dump_file)
 378         fprintf (dump_file, ";; Not considering loop, cold area\n");
 379       return;
 380     }
 381
 382   /* Can the loop be manipulated?  */
 383   if (!can_duplicate_loop_p (loop))
 384     {
 385       if (dump_file)
 386         fprintf (dump_file,
 387                  ";; Not considering loop, cannot duplicate\n");
 388       return;
 389     }
 390
 391   /* npeel = number of iterations to peel.  */
 392   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 393   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 394     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 395
 396   /* Is the loop small enough?  */
 397   if (!npeel)
 398     {
 399       if (dump_file)
 400         fprintf (dump_file, ";; Not considering loop, is too big\n");
 401       return;
 402     }
 403
 404   /* Check for simple loops.  */
 405   desc = get_simple_loop_desc (loop);
 406
 407   /* Check number of iterations.  */
 408   if (!desc->simple_p
 409       || desc->assumptions
 410       || !desc->const_iter
 411       || desc->infinite)
 412     {
 413       if (dump_file)
 414         fprintf (dump_file,
 415                  ";; Unable to prove that the loop iterates constant times\n");
 416       return;
 417     }
 418
 419   if (desc->niter > npeel - 1)
 420     {
 421       if (dump_file)
 422         {
 423           fprintf (dump_file,
 424                    ";; Not peeling loop completely, rolls too much (");
 425           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 426           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 427         }
 428       return;
 429     }
 430
 431   /* Success.  */
 432   if (dump_file)
 433     fprintf (dump_file, ";; Decided to peel loop completely\n");
 434   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 435 }
 436
 437 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 438    completely.  The transformation done:
 439
 440    for (i = 0; i < 4; i++)
 441      body;
 442
 443    ==>
 444
 445    i = 0;
 446    body; i++;
 447    body; i++;
 448    body; i++;
 449    body; i++;
 450    */
 451 static void
 452 peel_loop_completely (struct loop *loop)
 453 {
 454   sbitmap wont_exit;
 455   unsigned HOST_WIDE_INT npeel;
 456   unsigned i;
 457   vec<edge> remove_edges;
 458   edge ein;
 459   struct niter_desc *desc = get_simple_loop_desc (loop);
 460   struct opt_info *opt_info = NULL;
 461
 462   npeel = desc->niter;
 463
 464   if (npeel)
 465     {
 466       bool ok;
 467
 468       wont_exit = sbitmap_alloc (npeel + 1);
 469       bitmap_ones (wont_exit);
 470       bitmap_clear_bit (wont_exit, 0);
 471       if (desc->noloop_assumptions)
 472         bitmap_clear_bit (wont_exit, 1);
 473
 474       remove_edges.create (0);
 475
 476       if (flag_split_ivs_in_unroller)
 477         opt_info = analyze_insns_in_loop (loop);
 478
 479       opt_info_start_duplication (opt_info);
 480       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 481                                           npeel,
 482                                           wont_exit, desc->out_edge,
 483                                           &remove_edges,
 484                                           DLTHE_FLAG_UPDATE_FREQ
 485                                           | DLTHE_FLAG_COMPLETTE_PEEL
 486                                           | (opt_info
 487                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 488       gcc_assert (ok);
 489
 490       free (wont_exit);
 491
 492       if (opt_info)
 493         {
 494           apply_opt_in_copies (opt_info, npeel, false, true);
 495           free_opt_info (opt_info);
 496         }
 497
 498       /* Remove the exit edges.  */
 499       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 500         remove_path (ein);
 501       remove_edges.release ();
 502     }
 503
 504   ein = desc->in_edge;
 505   free_simple_loop_desc (loop);
 506
 507   /* Now remove the unreachable part of the last iteration and cancel
 508      the loop.  */
 509   remove_path (ein);
 510
 511   if (dump_file)
 512     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 513 }
 514
 515 /* Decide whether to unroll LOOP iterating constant number of times
 516    and how much.  */
 517
 518 static void
 519 decide_unroll_constant_iterations (struct loop *loop, int flags)
 520 {
 521   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 522   struct niter_desc *desc;
 523   double_int iterations;
 524
 525   if (!(flags & UAP_UNROLL))
 526     {
 527       /* We were not asked to, just return back silently.  */
 528       return;
 529     }
 530
 531   if (dump_file)
 532     fprintf (dump_file,
 533              "\n;; Considering unrolling loop with constant "
 534              "number of iterations\n");
 535
 536   /* nunroll = total number of copies of the original loop body in
 537      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 538   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 539   nunroll_by_av
 540     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 541   if (nunroll > nunroll_by_av)
 542     nunroll = nunroll_by_av;
 543   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 544     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 545
 546   /* Skip big loops.  */
 547   if (nunroll <= 1)
 548     {
 549       if (dump_file)
 550         fprintf (dump_file, ";; Not considering loop, is too big\n");
 551       return;
 552     }
 553
 554   /* Check for simple loops.  */
 555   desc = get_simple_loop_desc (loop);
 556
 557   /* Check number of iterations.  */
 558   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 559     {
 560       if (dump_file)
 561         fprintf (dump_file,
 562                  ";; Unable to prove that the loop iterates constant times\n");
 563       return;
 564     }
 565
 566   /* Check whether the loop rolls enough to consider.
 567      Consult also loop bounds and profile; in the case the loop has more
 568      than one exit it may well loop less than determined maximal number
 569      of iterations.  */
 570   if (desc->niter < 2 * nunroll
 571       || ((estimated_loop_iterations (loop, &iterations)
 572            || max_loop_iterations (loop, &iterations))
 573           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 574     {
 575       if (dump_file)
 576         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 577       return;
 578     }
 579
 580   /* Success; now compute number of iterations to unroll.  We alter
 581      nunroll so that as few as possible copies of loop body are
 582      necessary, while still not decreasing the number of unrollings
 583      too much (at most by 1).  */
 584   best_copies = 2 * nunroll + 10;
 585
 586   i = 2 * nunroll + 2;
 587   if (i - 1 >= desc->niter)
 588     i = desc->niter - 2;
 589
 590   for (; i >= nunroll - 1; i--)
 591     {
 592       unsigned exit_mod = desc->niter % (i + 1);
 593
 594       if (!loop_exit_at_end_p (loop))
 595         n_copies = exit_mod + i + 1;
 596       else if (exit_mod != (unsigned) i
 597                || desc->noloop_assumptions != NULL_RTX)
 598         n_copies = exit_mod + i + 2;
 599       else
 600         n_copies = i + 1;
 601
 602       if (n_copies < best_copies)
 603         {
 604           best_copies = n_copies;
 605           best_unroll = i;
 606         }
 607     }
 608
 609   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 610   loop->lpt_decision.times = best_unroll;
 611
 612   if (dump_file)
 613     fprintf (dump_file, ";; Decided to unroll the loop %d times (%d copies).\n",
 614              loop->lpt_decision.times, best_copies);
 615 }
 616
 617 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 618    The transformation does this:
 619
 620    for (i = 0; i < 102; i++)
 621      body;
 622
 623    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 624
 625    i = 0;
 626    body; i++;
 627    body; i++;
 628    while (i < 102)
 629      {
 630        body; i++;
 631        body; i++;
 632        body; i++;
 633        body; i++;
 634      }
 635   */
 636 static void
 637 unroll_loop_constant_iterations (struct loop *loop)
 638 {
 639   unsigned HOST_WIDE_INT niter;
 640   unsigned exit_mod;
 641   sbitmap wont_exit;
 642   unsigned i;
 643   vec<edge> remove_edges;
 644   edge e;
 645   unsigned max_unroll = loop->lpt_decision.times;
 646   struct niter_desc *desc = get_simple_loop_desc (loop);
 647   bool exit_at_end = loop_exit_at_end_p (loop);
 648   struct opt_info *opt_info = NULL;
 649   bool ok;
 650
 651   niter = desc->niter;
 652
 653   /* Should not get here (such loop should be peeled instead).  */
 654   gcc_assert (niter > max_unroll + 1);
 655
 656   exit_mod = niter % (max_unroll + 1);
 657
 658   wont_exit = sbitmap_alloc (max_unroll + 1);
 659   bitmap_ones (wont_exit);
 660
 661   remove_edges.create (0);
 662   if (flag_split_ivs_in_unroller
 663       || flag_variable_expansion_in_unroller)
 664     opt_info = analyze_insns_in_loop (loop);
 665
 666   if (!exit_at_end)
 667     {
 668       /* The exit is not at the end of the loop; leave exit test
 669          in the first copy, so that the loops that start with test
 670          of exit condition have continuous body after unrolling.  */
 671
 672       if (dump_file)
 673         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 674
 675       /* Peel exit_mod iterations.  */
 676       bitmap_clear_bit (wont_exit, 0);
 677       if (desc->noloop_assumptions)
 678         bitmap_clear_bit (wont_exit, 1);
 679
 680       if (exit_mod)
 681         {
 682           opt_info_start_duplication (opt_info);
 683           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 684                                               exit_mod,
 685                                               wont_exit, desc->out_edge,
 686                                               &remove_edges,
 687                                               DLTHE_FLAG_UPDATE_FREQ
 688                                               | (opt_info && exit_mod > 1
 689                                                  ? DLTHE_RECORD_COPY_NUMBER
 690                                                    : 0));
 691           gcc_assert (ok);
 692
 693           if (opt_info && exit_mod > 1)
 694             apply_opt_in_copies (opt_info, exit_mod, false, false);
 695
 696           desc->noloop_assumptions = NULL_RTX;
 697           desc->niter -= exit_mod;
 698           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 699           if (loop->any_estimate
 700               && double_int::from_uhwi (exit_mod).ule
 701                    (loop->nb_iterations_estimate))
 702             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 703           else
 704             loop->any_estimate = false;
 705         }
 706
 707       bitmap_set_bit (wont_exit, 1);
 708     }
 709   else
 710     {
 711       /* Leave exit test in last copy, for the same reason as above if
 712          the loop tests the condition at the end of loop body.  */
 713
 714       if (dump_file)
 715         fprintf (dump_file, ";; Condition at end of loop.\n");
 716
 717       /* We know that niter >= max_unroll + 2; so we do not need to care of
 718          case when we would exit before reaching the loop.  So just peel
 719          exit_mod + 1 iterations.  */
 720       if (exit_mod != max_unroll
 721           || desc->noloop_assumptions)
 722         {
 723           bitmap_clear_bit (wont_exit, 0);
 724           if (desc->noloop_assumptions)
 725             bitmap_clear_bit (wont_exit, 1);
 726
 727           opt_info_start_duplication (opt_info);
 728           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 729                                               exit_mod + 1,
 730                                               wont_exit, desc->out_edge,
 731                                               &remove_edges,
 732                                               DLTHE_FLAG_UPDATE_FREQ
 733                                               | (opt_info && exit_mod > 0
 734                                                  ? DLTHE_RECORD_COPY_NUMBER
 735                                                    : 0));
 736           gcc_assert (ok);
 737
 738           if (opt_info && exit_mod > 0)
 739             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 740
 741           desc->niter -= exit_mod + 1;
 742           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 743           if (loop->any_estimate
 744               && double_int::from_uhwi (exit_mod + 1).ule
 745                    (loop->nb_iterations_estimate))
 746             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 747           else
 748             loop->any_estimate = false;
 749           desc->noloop_assumptions = NULL_RTX;
 750
 751           bitmap_set_bit (wont_exit, 0);
 752           bitmap_set_bit (wont_exit, 1);
 753         }
 754
 755       bitmap_clear_bit (wont_exit, max_unroll);
 756     }
 757
 758   /* Now unroll the loop.  */
 759
 760   opt_info_start_duplication (opt_info);
 761   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 762                                       max_unroll,
 763                                       wont_exit, desc->out_edge,
 764                                       &remove_edges,
 765                                       DLTHE_FLAG_UPDATE_FREQ
 766                                       | (opt_info
 767                                          ? DLTHE_RECORD_COPY_NUMBER
 768                                            : 0));
 769   gcc_assert (ok);
 770
 771   if (opt_info)
 772     {
 773       apply_opt_in_copies (opt_info, max_unroll, true, true);
 774       free_opt_info (opt_info);
 775     }
 776
 777   free (wont_exit);
 778
 779   if (exit_at_end)
 780     {
 781       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 782       /* Find a new in and out edge; they are in the last copy we have made.  */
 783
 784       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 785         {
 786           desc->out_edge = EDGE_SUCC (exit_block, 0);
 787           desc->in_edge = EDGE_SUCC (exit_block, 1);
 788         }
 789       else
 790         {
 791           desc->out_edge = EDGE_SUCC (exit_block, 1);
 792           desc->in_edge = EDGE_SUCC (exit_block, 0);
 793         }
 794     }
 795
 796   desc->niter /= max_unroll + 1;
 797   loop->nb_iterations_upper_bound
 798     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 799                                                                    + 1),
 800                                             TRUNC_DIV_EXPR);
 801   if (loop->any_estimate)
 802     loop->nb_iterations_estimate
 803       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 804                                                                   + 1),
 805                                            TRUNC_DIV_EXPR);
 806   desc->niter_expr = GEN_INT (desc->niter);
 807
 808   /* Remove the edges.  */
 809   FOR_EACH_VEC_ELT (remove_edges, i, e)
 810     remove_path (e);
 811   remove_edges.release ();
 812
 813   if (dump_file)
 814     fprintf (dump_file,
 815              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 816              max_unroll, num_loop_insns (loop));
 817 }
 818
 819 /* Decide whether to unroll LOOP iterating runtime computable number of times
 820    and how much.  */
 821 static void
 822 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 823 {
 824   unsigned nunroll, nunroll_by_av, i;
 825   struct niter_desc *desc;
 826   double_int iterations;
 827
 828   if (!(flags & UAP_UNROLL))
 829     {
 830       /* We were not asked to, just return back silently.  */
 831       return;
 832     }
 833
 834   if (dump_file)
 835     fprintf (dump_file,
 836              "\n;; Considering unrolling loop with runtime "
 837              "computable number of iterations\n");
 838
 839   /* nunroll = total number of copies of the original loop body in
 840      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 841   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 842   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 843   if (nunroll > nunroll_by_av)
 844     nunroll = nunroll_by_av;
 845   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 846     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 847
 848   if (targetm.loop_unroll_adjust)
 849     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 850
 851   /* Skip big loops.  */
 852   if (nunroll <= 1)
 853     {
 854       if (dump_file)
 855         fprintf (dump_file, ";; Not considering loop, is too big\n");
 856       return;
 857     }
 858
 859   /* Check for simple loops.  */
 860   desc = get_simple_loop_desc (loop);
 861
 862   /* Check simpleness.  */
 863   if (!desc->simple_p || desc->assumptions)
 864     {
 865       if (dump_file)
 866         fprintf (dump_file,
 867                  ";; Unable to prove that the number of iterations "
 868                  "can be counted in runtime\n");
 869       return;
 870     }
 871
 872   if (desc->const_iter)
 873     {
 874       if (dump_file)
 875         fprintf (dump_file, ";; Loop iterates constant times\n");
 876       return;
 877     }
 878
 879   /* Check whether the loop rolls.  */
 880   if ((estimated_loop_iterations (loop, &iterations)
 881        || max_loop_iterations (loop, &iterations))
 882       && iterations.ult (double_int::from_shwi (2 * nunroll)))
 883     {
 884       if (dump_file)
 885         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 886       return;
 887     }
 888
 889   /* Success; now force nunroll to be power of 2, as we are unable to
 890      cope with overflows in computation of number of iterations.  */
 891   for (i = 1; 2 * i <= nunroll; i *= 2)
 892     continue;
 893
 894   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 895   loop->lpt_decision.times = i - 1;
 896
 897   if (dump_file)
 898     fprintf (dump_file, ";; Decided to unroll the loop %d times.\n",
 899              loop->lpt_decision.times);
 900 }
 901
 902 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
 903    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
 904    and NULL is returned instead.  */
 905
 906 basic_block
 907 split_edge_and_insert (edge e, rtx insns)
 908 {
 909   basic_block bb;
 910
 911   if (!insns)
 912     return NULL;
 913   bb = split_edge (e);
 914   emit_insn_after (insns, BB_END (bb));
 915
 916   /* ??? We used to assume that INSNS can contain control flow insns, and
 917      that we had to try to find sub basic blocks in BB to maintain a valid
 918      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
 919      and call break_superblocks when going out of cfglayout mode.  But it
 920      turns out that this never happens; and that if it does ever happen,
 921      the TODO_verify_flow at the end of the RTL loop passes would fail.
 922
 923      There are two reasons why we expected we could have control flow insns
 924      in INSNS.  The first is when a comparison has to be done in parts, and
 925      the second is when the number of iterations is computed for loops with
 926      the number of iterations known at runtime.  In both cases, test cases
 927      to get control flow in INSNS appear to be impossible to construct:
 928
 929       * If do_compare_rtx_and_jump needs several branches to do comparison
 930         in a mode that needs comparison by parts, we cannot analyze the
 931         number of iterations of the loop, and we never get to unrolling it.
 932
 933       * The code in expand_divmod that was suspected to cause creation of
 934         branching code seems to be only accessed for signed division.  The
 935         divisions used by # of iterations analysis are always unsigned.
 936         Problems might arise on architectures that emits branching code
 937         for some operations that may appear in the unroller (especially
 938         for division), but we have no such architectures.
 939
 940      Considering all this, it was decided that we should for now assume
 941      that INSNS can in theory contain control flow insns, but in practice
 942      it never does.  So we don't handle the theoretical case, and should
 943      a real failure ever show up, we have a pretty good clue for how to
 944      fix it.  */
 945
 946   return bb;
 947 }
 948
 949 /* Unroll LOOP for which we are able to count number of iterations in runtime
 950    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
 951    extra care for case n < 0):
 952
 953    for (i = 0; i < n; i++)
 954      body;
 955
 956    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 957
 958    i = 0;
 959    mod = n % 4;
 960
 961    switch (mod)
 962      {
 963        case 3:
 964          body; i++;
 965        case 2:
 966          body; i++;
 967        case 1:
 968          body; i++;
 969        case 0: ;
 970      }
 971
 972    while (i < n)
 973      {
 974        body; i++;
 975        body; i++;
 976        body; i++;
 977        body; i++;
 978      }

        The remainder is computed by ANDing the iteration count with
        LOOP->LPT_DECISION.TIMES, the "switch" is built from a chain of
        compare-and-jump blocks placed in front of peeled copies of the body,
        and afterwards the niter description of LOOP and its recorded
        iteration bounds are divided by the number of copies.
 979    */
 980 static void
 981 unroll_loop_runtime_iterations (struct loop *loop)
 982 {
 983   rtx old_niter, niter, init_code, branch_code, tmp;
 984   unsigned i, j, p;
 985   basic_block preheader, *body, swtch, ezc_swtch;
 986   vec<basic_block> dom_bbs;
 987   sbitmap wont_exit;
 988   int may_exit_copy;
 989   unsigned n_peel;
 990   vec<edge> remove_edges;
 991   edge e;
 992   bool extra_zero_check, last_may_exit;
       /* Number of extra copies of the body; the unrolled loop then contains
          MAX_UNROLL + 1 copies (see the example above).  */
 993   unsigned max_unroll = loop->lpt_decision.times;
 994   struct niter_desc *desc = get_simple_loop_desc (loop);
 995   bool exit_at_end = loop_exit_at_end_p (loop);
 996   struct opt_info *opt_info = NULL;
 997   bool ok;
 998
       /* Gather per-insn optimization data only when IV splitting or variable
          expansion is enabled; OPT_INFO stays NULL otherwise.  */
 999   if (flag_split_ivs_in_unroller
1000       || flag_variable_expansion_in_unroller)
1001     opt_info = analyze_insns_in_loop (loop);
1002
1003   /* Remember blocks whose dominators will have to be updated.  These are
          the blocks outside LOOP that get_dominated_by reports for any block
          of LOOP; they are passed to iterate_fix_dominators once the
          preconditioning code has been built.  */
1004   dom_bbs.create (0);
1005
1006   body = get_loop_body (loop);
1007   for (i = 0; i < loop->num_nodes; i++)
1008     {
1009       vec<basic_block> ldom;
1010       basic_block bb;
1011
1012       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1013       FOR_EACH_VEC_ELT (ldom, j, bb)
1014         if (!flow_bb_inside_loop_p (loop, bb))
1015           dom_bbs.safe_push (bb);
1016
1017       ldom.release ();
1018     }
1019   free (body);
1020
       /* Choose which copy keeps the exit test, how many copies are peeled in
          front of the loop, and whether a separate test for a zero remainder
          is needed.  */
1021   if (!exit_at_end)
1022     {
1023       /* Leave exit in first copy (for explanation why see comment in
1024          unroll_loop_constant_iterations).  */
1025       may_exit_copy = 0;
1026       n_peel = max_unroll - 1;
1027       extra_zero_check = true;
1028       last_may_exit = false;
1029     }
1030   else
1031     {
1032       /* Leave exit in last copy (for explanation why see comment in
1033          unroll_loop_constant_iterations).  */
1034       may_exit_copy = max_unroll;
1035       n_peel = max_unroll;
1036       extra_zero_check = false;
1037       last_may_exit = true;
1038     }
1039
1040   /* Get expression for number of iterations.  The value is computed into
          a fresh pseudo; OLD_NITER keeps referring to that pseudo after NITER
          is overwritten with the remainder below.  */
1041   start_sequence ();
1042   old_niter = niter = gen_reg_rtx (desc->mode);
1043   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1044   if (tmp != niter)
1045     emit_move_insn (niter, tmp);
1046
1047   /* Count modulo by ANDing it with max_unroll; we use the fact that
1048      the number of unrollings is a power of two, and thus this is correct
1049      even if there is overflow in the computation.  (MAX_UNROLL itself is
          that power of two minus one: decide_unroll_runtime_iterations stores
          I - 1 for a power of two I.)  */
1050   niter = expand_simple_binop (desc->mode, AND,
1051                                niter,
1052                                GEN_INT (max_unroll),
1053                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1054
1055   init_code = get_insns ();
1056   end_sequence ();
1057   unshare_all_rtl_in_chain (init_code);
1058
1059   /* Precondition the loop.  The computation of the iteration count and of
          its remainder is placed on the preheader edge.  */
1060   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1061
1062   remove_edges.create (0);
1063
       /* WONT_EXIT is indexed by copy number in the calls to
          duplicate_loop_to_header_edge below -- presumably a set bit marks a
          copy whose exit may be removed; confirm against that function.  */
1064   wont_exit = sbitmap_alloc (max_unroll + 2);
1065
1066   /* Peel the first copy of loop body (almost always we must leave exit test
1067      here; the only exception is when we have extra zero check and the number
1068      of iterations is reliable).  Also record the place of (possible) extra
1069      zero check.  */
1070   bitmap_clear (wont_exit);
1071   if (extra_zero_check
1072       && !desc->noloop_assumptions)
1073     bitmap_set_bit (wont_exit, 1);
1074   ezc_swtch = loop_preheader_edge (loop)->src;
1075   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1076                                       1, wont_exit, desc->out_edge,
1077                                       &remove_edges,
1078                                       DLTHE_FLAG_UPDATE_FREQ);
1079   gcc_assert (ok);
1080
1081   /* Record the place where switch will be built for preconditioning.  */
1082   swtch = split_edge (loop_preheader_edge (loop));
1083
1084   for (i = 0; i < n_peel; i++)
1085     {
1086       /* Peel the copy.  Only the last peeled copy may keep its exit, and
              only when LAST_MAY_EXIT is set.  */
1087       bitmap_clear (wont_exit);
1088       if (i != n_peel - 1 || !last_may_exit)
1089         bitmap_set_bit (wont_exit, 1);
1090       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1091                                           1, wont_exit, desc->out_edge,
1092                                           &remove_edges,
1093                                           DLTHE_FLAG_UPDATE_FREQ);
1094       gcc_assert (ok);
1095
1096       /* Create item for switch.  J is the value of the remainder NITER for
              which control enters the peeled copies at this point; P is the
              probability assigned to that branch.  */
1097       j = n_peel - i - (extra_zero_check ? 0 : 1);
1098       p = REG_BR_PROB_BASE / (i + 2);
1099
1100       preheader = split_edge (loop_preheader_edge (loop));
1101       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1102                                           block_label (preheader), p,
1103                                           NULL_RTX);
1104
1105       /* We rely on the fact that the compare and jump cannot be optimized out,
1106          and hence the cfg we create is correct.  */
1107       gcc_assert (branch_code != NULL_RTX);
1108
           /* Insert the test in front of the switch built so far and add the
              taken edge from the new test block to PREHEADER.  */
1109       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1110       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1111       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1112       e = make_edge (swtch, preheader,
1113                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
           /* NOTE(review): the count is scaled by REG_BR_PROB_BASE / P rather
              than P / REG_BR_PROB_BASE -- confirm this is intended.  */
1114       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1115       e->probability = p;
1116     }
1117
1118   if (extra_zero_check)
1119     {
1120       /* Add branch for zero iterations.  It is inserted after EZC_SWTCH,
              the block recorded before the first copy was peeled.  */
1121       p = REG_BR_PROB_BASE / (max_unroll + 1);
1122       swtch = ezc_swtch;
1123       preheader = split_edge (loop_preheader_edge (loop));
1124       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1125                                           block_label (preheader), p,
1126                                           NULL_RTX);
1127       gcc_assert (branch_code != NULL_RTX);
1128
1129       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1130       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1131       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1132       e = make_edge (swtch, preheader,
1133                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
           /* NOTE(review): same count formula as in the loop above -- confirm.  */
1134       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1135       e->probability = p;
1136     }
1137
1138   /* Recount dominators for outer blocks.  */
1139   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1140
1141   /* And unroll loop.  Every copy except MAY_EXIT_COPY is marked in
          WONT_EXIT.  */
1142
1143   bitmap_ones (wont_exit);
1144   bitmap_clear_bit (wont_exit, may_exit_copy);
1145   opt_info_start_duplication (opt_info);
1146
1147   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1148                                       max_unroll,
1149                                       wont_exit, desc->out_edge,
1150                                       &remove_edges,
1151                                       DLTHE_FLAG_UPDATE_FREQ
1152                                       | (opt_info
1153                                          ? DLTHE_RECORD_COPY_NUMBER
1154                                            : 0));
1155   gcc_assert (ok);
1156
1157   if (opt_info)
1158     {
1159       apply_opt_in_copies (opt_info, max_unroll, true, true);
1160       free_opt_info (opt_info);
1161     }
1162
1163   free (wont_exit);
1164
1165   if (exit_at_end)
1166     {
1167       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1168       /* Find a new in and out edge; they are in the last copy we have
1169          made.  The successor of EXIT_BLOCK that leads to the old exit
              destination becomes the out edge, the other one the in edge.  */
1170
1171       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1172         {
1173           desc->out_edge = EDGE_SUCC (exit_block, 0);
1174           desc->in_edge = EDGE_SUCC (exit_block, 1);
1175         }
1176       else
1177         {
1178           desc->out_edge = EDGE_SUCC (exit_block, 1);
1179           desc->in_edge = EDGE_SUCC (exit_block, 0);
1180         }
1181     }
1182
1183   /* Remove the edges.  */
1184   FOR_EACH_VEC_ELT (remove_edges, i, e)
1185     remove_path (e);
1186   remove_edges.release ();
1187
1188   /* We must be careful when updating the number of iterations due to
1189      preconditioning and the fact that the value must be valid at entry
1190      of the loop.  After passing through the above code, we see that
1191      the correct new number of iterations is this:  */
1192   gcc_assert (!desc->const_iter);
1193   desc->niter_expr =
1194     simplify_gen_binary (UDIV, desc->mode, old_niter,
1195                          GEN_INT (max_unroll + 1));
       /* The recorded upper bound and, if present, the estimate are divided by
          the number of copies as well.  */
1196   loop->nb_iterations_upper_bound
1197     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1198                                                                    + 1),
1199                                             TRUNC_DIV_EXPR);
1200   if (loop->any_estimate)
1201     loop->nb_iterations_estimate
1202       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1203                                                                   + 1),
1204                                            TRUNC_DIV_EXPR);
1205   if (exit_at_end)
1206     {
           /* With the exit in the last copy, one is subtracted from the
              expression and from both bounds; an estimate that is absent or
              already zero is dropped instead of being decremented.  */
1207       desc->niter_expr =
1208         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1209       desc->noloop_assumptions = NULL_RTX;
1210       --loop->nb_iterations_upper_bound;
1211       if (loop->any_estimate
1212           && loop->nb_iterations_estimate != double_int_zero)
1213         --loop->nb_iterations_estimate;
1214       else
1215         loop->any_estimate = false;
1216     }
1217
       /* NOTE(review): MAX_UNROLL is unsigned but is printed with %d below;
          harmless for realistic values, %u would match the type.  */
1218   if (dump_file)
1219     fprintf (dump_file,
1220              ";; Unrolled loop %d times, counting # of iterations "
1221              "in runtime, %i insns\n",
1222              max_unroll, num_loop_insns (loop));
1223
1224   dom_bbs.release ();
1225 }
1226
1227 /* Decide whether to simply peel LOOP and how much.  */
1228 static void
1229 decide_peel_simple (struct loop *loop, int flags)
1230 {
1231   unsigned npeel;
1232   double_int iterations;
1233
1234   if (!(flags & UAP_PEEL))
1235     {
1236       /* We were not asked to, just return back silently.  */
1237       return;
1238     }
1239
1240   if (dump_file)
1241     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1242
1243   /* npeel = number of iterations to peel.  */
1244   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1245   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1246     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1247
1248   /* Skip big loops.  */
1249   if (!npeel)
1250     {
1251       if (dump_file)
1252         fprintf (dump_file, ";; Not considering loop, is too big\n");
1253       return;
1254     }
1255
1256   /* Do not simply peel loops with branches inside -- it increases number
1257      of mispredicts.
1258      Exception is when we do have profile and we however have good chance
1259      to peel proper number of iterations loop will iterate in practice.
1260      TODO: this heuristic needs tunning; while for complette unrolling
1261      the branch inside loop mostly eliminates any improvements, for
1262      peeling it is not the case.  Also a function call inside loop is
1263      also branch from branch prediction POV (and probably better reason
1264      to not unroll/peel).  */
1265   if (num_loop_branches (loop) > 1
1266       && profile_status != PROFILE_READ)
1267     {
1268       if (dump_file)
1269         fprintf (dump_file, ";; Not peeling, contains branches\n");
1270       return;
1271     }
1272
1273   /* If we have realistic estimate on number of iterations, use it.  */
1274   if (estimated_loop_iterations (loop, &iterations))
1275     {
1276       if (double_int::from_shwi (npeel).ule (iterations))
1277         {
1278           if (dump_file)
1279             {
1280               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1281               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1282                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1283               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1284                        npeel);
1285             }
1286           return;
1287         }
1288       npeel = iterations.to_shwi () + 1;
1289     }
1290   /* If we have small enough bound on iterations, we can still peel (completely
1291      unroll).  */
1292   else if (max_loop_iterations (loop, &iterations)
1293            && iterations.ult (double_int::from_shwi (npeel)))
1294     npeel = iterations.to_shwi () + 1;
1295   else
1296     {
1297       /* For now we have no good heuristics to decide whether loop peeling
1298          will be effective, so disable it.  */
1299       if (dump_file)
1300         fprintf (dump_file,
1301                  ";; Not peeling loop, no evidence it will be profitable\n");
1302       return;
1303     }
1304
1305   /* Success.  */
1306   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1307   loop->lpt_decision.times = npeel;
1308
1309   if (dump_file)
1310     fprintf (dump_file, ";; Decided to simply peel the loop %d times.\n",
1311              loop->lpt_decision.times);
1312 }
1313
1314 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1315
1316    while (cond)
1317      body;
1318
1319    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1320
1321    if (!cond) goto end;
1322    body;
1323    if (!cond) goto end;
1324    body;
1325    if (!cond) goto end;
1326    body;
1327    while (cond)
1328      body;
1329    end: ;
1330    */
1331 static void
1332 peel_loop_simple (struct loop *loop)
1333 {
1334   sbitmap wont_exit;
1335   unsigned npeel = loop->lpt_decision.times;
1336   struct niter_desc *desc = get_simple_loop_desc (loop);
1337   struct opt_info *opt_info = NULL;
1338   bool ok;
1339
1340   if (flag_split_ivs_in_unroller && npeel > 1)
1341     opt_info = analyze_insns_in_loop (loop);
1342
1343   wont_exit = sbitmap_alloc (npeel + 1);
1344   bitmap_clear (wont_exit);
1345
1346   opt_info_start_duplication (opt_info);
1347
1348   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1349                                       npeel, wont_exit, NULL,
1350                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1351                                       | (opt_info
1352                                          ? DLTHE_RECORD_COPY_NUMBER
1353                                            : 0));
1354   gcc_assert (ok);
1355
1356   free (wont_exit);
1357
1358   if (opt_info)
1359     {
1360       apply_opt_in_copies (opt_info, npeel, false, false);
1361       free_opt_info (opt_info);
1362     }
1363
1364   if (desc->simple_p)
1365     {
1366       if (desc->const_iter)
1367         {
1368           desc->niter -= npeel;
1369           desc->niter_expr = GEN_INT (desc->niter);
1370           desc->noloop_assumptions = NULL_RTX;
1371         }
1372       else
1373         {
1374           /* We cannot just update niter_expr, as its value might be clobbered
1375              inside loop.  We could handle this by counting the number into
1376              temporary just like we do in runtime unrolling, but it does not
1377              seem worthwhile.  */
1378           free_simple_loop_desc (loop);
1379         }
1380     }
1381   if (dump_file)
1382     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1383 }
1384
1385 /* Decide whether to unroll LOOP stupidly and how much.  */
1386 static void
1387 decide_unroll_stupid (struct loop *loop, int flags)
1388 {
1389   unsigned nunroll, nunroll_by_av, i;
1390   struct niter_desc *desc;
1391   double_int iterations;
1392
1393   if (!(flags & UAP_UNROLL_ALL))
1394     {
1395       /* We were not asked to, just return back silently.  */
1396       return;
1397     }
1398
1399   if (dump_file)
1400     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1401
1402   /* nunroll = total number of copies of the original loop body in
1403      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1404   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1405   nunroll_by_av
1406     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1407   if (nunroll > nunroll_by_av)
1408     nunroll = nunroll_by_av;
1409   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1410     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1411
1412   if (targetm.loop_unroll_adjust)
1413     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1414
1415   /* Skip big loops.  */
1416   if (nunroll <= 1)
1417     {
1418       if (dump_file)
1419         fprintf (dump_file, ";; Not considering loop, is too big\n");
1420       return;
1421     }
1422
1423   /* Check for simple loops.  */
1424   desc = get_simple_loop_desc (loop);
1425
1426   /* Check simpleness.  */
1427   if (desc->simple_p && !desc->assumptions)
1428     {
1429       if (dump_file)
1430         fprintf (dump_file, ";; The loop is simple\n");
1431       return;
1432     }
1433
1434   /* Do not unroll loops with branches inside -- it increases number
1435      of mispredicts.
1436      TODO: this heuristic needs tunning; call inside the loop body
1437      is also relatively good reason to not unroll.  */
1438   if (num_loop_branches (loop) > 1)
1439     {
1440       if (dump_file)
1441         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1442       return;
1443     }
1444
1445   /* Check whether the loop rolls.  */
1446   if ((estimated_loop_iterations (loop, &iterations)
1447        || max_loop_iterations (loop, &iterations))
1448       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1449     {
1450       if (dump_file)
1451         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1452       return;
1453     }
1454
1455   /* Success.  Now force nunroll to be power of 2, as it seems that this
1456      improves results (partially because of better alignments, partially
1457      because of some dark magic).  */
1458   for (i = 1; 2 * i <= nunroll; i *= 2)
1459     continue;
1460
1461   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1462   loop->lpt_decision.times = i - 1;
1463
1464   if (dump_file)
1465     fprintf (dump_file, ";; Decided to unroll the loop stupidly %d times.\n",
1466              loop->lpt_decision.times);
1467 }
1468
1469 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1470
1471    while (cond)
1472      body;
1473
1474    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1475
1476    while (cond)
1477      {
1478        body;
1479        if (!cond) break;
1480        body;
1481        if (!cond) break;
1482        body;
1483        if (!cond) break;
1484        body;
1485      }
1486    */
1487 static void
1488 unroll_loop_stupid (struct loop *loop)
1489 {
1490   sbitmap wont_exit;
1491   unsigned nunroll = loop->lpt_decision.times;
1492   struct niter_desc *desc = get_simple_loop_desc (loop);
1493   struct opt_info *opt_info = NULL;
1494   bool ok;
1495
1496   if (flag_split_ivs_in_unroller
1497       || flag_variable_expansion_in_unroller)
1498     opt_info = analyze_insns_in_loop (loop);
1499
1500
1501   wont_exit = sbitmap_alloc (nunroll + 1);
1502   bitmap_clear (wont_exit);
1503   opt_info_start_duplication (opt_info);
1504
1505   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1506                                       nunroll, wont_exit,
1507                                       NULL, NULL,
1508                                       DLTHE_FLAG_UPDATE_FREQ
1509                                       | (opt_info
1510                                          ? DLTHE_RECORD_COPY_NUMBER
1511                                            : 0));
1512   gcc_assert (ok);
1513
1514   if (opt_info)
1515     {
1516       apply_opt_in_copies (opt_info, nunroll, true, true);
1517       free_opt_info (opt_info);
1518     }
1519
1520   free (wont_exit);
1521
1522   if (desc->simple_p)
1523     {
1524       /* We indeed may get here provided that there are nontrivial assumptions
1525          for a loop to be really simple.  We could update the counts, but the
1526          problem is that we are unable to decide which exit will be taken
1527          (not really true in case the number of iterations is constant,
1528          but noone will do anything with this information, so we do not
1529          worry about it).  */
1530       desc->simple_p = false;
1531     }
1532
1533   if (dump_file)
1534     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1535              nunroll, num_loop_insns (loop));
1536 }
1537
1538 /* A hash function for information about insns to split.  */
1539
1540 static hashval_t
1541 si_info_hash (const void *ivts)
1542 {
1543   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1544 }
1545
1546 /* An equality functions for information about insns to split.  */
1547
1548 static int
1549 si_info_eq (const void *ivts1, const void *ivts2)
1550 {
1551   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1552   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1553
1554   return i1->insn == i2->insn;
1555 }
1556
1557 /* Return a hash for VES, which is really a "var_to_expand *".  */
1558
1559 static hashval_t
1560 ve_info_hash (const void *ves)
1561 {
1562   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1563 }
1564
1565 /* Return true if IVTS1 and IVTS2 (which are really both of type
1566    "var_to_expand *") refer to the same instruction.  */
1567
1568 static int
1569 ve_info_eq (const void *ivts1, const void *ivts2)
1570 {
1571   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1572   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1573
1574   return i1->insn == i2->insn;
1575 }
1576
1577 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1578    Set *DEBUG_USES to the number of debug insns that reference the
1579    variable.  */
1580
1581 bool
1582 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1583                                   int *debug_uses)
1584 {
1585   basic_block *body, bb;
1586   unsigned i;
1587   int count_ref = 0;
1588   rtx insn;
1589
1590   body = get_loop_body (loop);
1591   for (i = 0; i < loop->num_nodes; i++)
1592     {
1593       bb = body[i];
1594
1595       FOR_BB_INSNS (bb, insn)
1596         if (!rtx_referenced_p (reg, insn))
1597           continue;
1598         else if (DEBUG_INSN_P (insn))
1599           ++*debug_uses;
1600         else if (++count_ref > 1)
1601           break;
1602     }
1603   free (body);
1604   return (count_ref  == 1);
1605 }
1606
1607 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1608
1609 static void
1610 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1611 {
1612   basic_block *body, bb;
1613   unsigned i;
1614   rtx insn;
1615
1616   body = get_loop_body (loop);
1617   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1618     {
1619       bb = body[i];
1620
1621       FOR_BB_INSNS (bb, insn)
1622         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1623           continue;
1624         else
1625           {
1626             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1627                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1628             if (!--debug_uses)
1629               break;
1630           }
1631     }
1632   free (body);
1633 }
1634
1635 /* Determine whether INSN contains an accumulator
1636    which can be expanded into separate copies,
1637    one for each copy of the LOOP body.
1638
1639    for (i = 0 ; i < n; i++)
1640      sum += a[i];
1641
1642    ==>
1643
1644    sum += a[i]
1645    ....
1646    i = i+1;
1647    sum1 += a[i]
1648    ....
1649    i = i+1
1650    sum2 += a[i];
1651    ....
1652
1653    Return NULL if INSN contains no opportunity for expansion of accumulator.
1654    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1655    information and return a pointer to it.
1656 */
1657
1658 static struct var_to_expand *
1659 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1660 {
1661   rtx set, dest, src;
1662   struct var_to_expand *ves;
1663   unsigned accum_pos;
1664   enum rtx_code code;
1665   int debug_uses = 0;
1666
1667   set = single_set (insn);
1668   if (!set)
1669     return NULL;
1670
1671   dest = SET_DEST (set);
1672   src = SET_SRC (set);
1673   code = GET_CODE (src);
1674
1675   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1676     return NULL;
1677
1678   if (FLOAT_MODE_P (GET_MODE (dest)))
1679     {
1680       if (!flag_associative_math)
1681         return NULL;
1682       /* In the case of FMA, we're also changing the rounding.  */
1683       if (code == FMA && !flag_unsafe_math_optimizations)
1684         return NULL;
1685     }
1686
1687   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1688      in MD.  But if there is no optab to generate the insn, we can not
1689      perform the variable expansion.  This can happen if an MD provides
1690      an insn but not a named pattern to generate it, for example to avoid
1691      producing code that needs additional mode switches like for x87/mmx.
1692
1693      So we check have_insn_for which looks for an optab for the operation
1694      in SRC.  If it doesn't exist, we can't perform the expansion even
1695      though INSN is valid.  */
1696   if (!have_insn_for (code, GET_MODE (src)))
1697     return NULL;
1698
1699   if (!REG_P (dest)
1700       && !(GET_CODE (dest) == SUBREG
1701            && REG_P (SUBREG_REG (dest))))
1702     return NULL;
1703
1704   /* Find the accumulator use within the operation.  */
1705   if (code == FMA)
1706     {
1707       /* We only support accumulation via FMA in the ADD position.  */
1708       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1709         return NULL;
1710       accum_pos = 2;
1711     }
1712   else if (rtx_equal_p (dest, XEXP (src, 0)))
1713     accum_pos = 0;
1714   else if (rtx_equal_p (dest, XEXP (src, 1)))
1715     {
1716       /* The method of expansion that we are using; which includes the
1717          initialization of the expansions with zero and the summation of
1718          the expansions at the end of the computation will yield wrong
1719          results for (x = something - x) thus avoid using it in that case.  */
1720       if (code == MINUS)
1721         return NULL;
1722       accum_pos = 1;
1723     }
1724   else
1725     return NULL;
1726
1727   /* It must not otherwise be used.  */
1728   if (code == FMA)
1729     {
1730       if (rtx_referenced_p (dest, XEXP (src, 0))
1731           || rtx_referenced_p (dest, XEXP (src, 1)))
1732         return NULL;
1733     }
1734   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1735     return NULL;
1736
1737   /* It must be used in exactly one insn.  */
1738   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1739     return NULL;
1740
1741   if (dump_file)
1742     {
1743       fprintf (dump_file, "\n;; Expanding Accumulator ");
1744       print_rtl (dump_file, dest);
1745       fprintf (dump_file, "\n");
1746     }
1747
1748   if (debug_uses)
1749     /* Instead of resetting the debug insns, we could replace each
1750        debug use in the loop with the sum or product of all expanded
1751        accummulators.  Since we'll only know of all expansions at the
1752        end, we'd have to keep track of which vars_to_expand a debug
1753        insn in the loop references, take note of each copy of the
1754        debug insn during unrolling, and when it's all done, compute
1755        the sum or product of each variable and adjust the original
1756        debug insn and each copy thereof.  What a pain!  */
1757     reset_debug_uses_in_loop (loop, dest, debug_uses);
1758
1759   /* Record the accumulator to expand.  */
1760   ves = XNEW (struct var_to_expand);
1761   ves->insn = insn;
1762   ves->reg = copy_rtx (dest);
1763   ves->var_expansions.create (1);
1764   ves->next = NULL;
1765   ves->op = GET_CODE (src);
1766   ves->expansion_count = 0;
1767   ves->reuse_expansion = 0;
1768   return ves;
1769 }
1770
1771 /* Determine whether there is an induction variable in INSN that
1772    we would like to split during unrolling.
1773
1774    I.e. replace
1775
1776    i = i + 1;
1777    ...
1778    i = i + 1;
1779    ...
1780    i = i + 1;
1781    ...
1782
1783    type chains by
1784
1785    i0 = i + 1
1786    ...
1787    i = i0 + 1
1788    ...
1789    i = i0 + 2
1790    ...
1791
1792    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1793    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1794    pointer to it.  */
1795
1796 static struct iv_to_split *
1797 analyze_iv_to_split_insn (rtx insn)
1798 {
1799   rtx set, dest;
1800   struct rtx_iv iv;
1801   struct iv_to_split *ivts;
1802   bool ok;
1803
1804   /* For now we just split the basic induction variables.  Later this may be
1805      extended for example by selecting also addresses of memory references.  */
1806   set = single_set (insn);
1807   if (!set)
1808     return NULL;
1809
1810   dest = SET_DEST (set);
1811   if (!REG_P (dest))
1812     return NULL;
1813
1814   if (!biv_p (insn, dest))
1815     return NULL;
1816
1817   ok = iv_analyze_result (insn, dest, &iv);
1818
1819   /* This used to be an assert under the assumption that if biv_p returns
1820      true that iv_analyze_result must also return true.  However, that
1821      assumption is not strictly correct as evidenced by pr25569.
1822
1823      Returning NULL when iv_analyze_result returns false is safe and
1824      avoids the problems in pr25569 until the iv_analyze_* routines
1825      can be fixed, which is apparently hard and time consuming
1826      according to their author.  */
1827   if (! ok)
1828     return NULL;
1829
1830   if (iv.step == const0_rtx
1831       || iv.mode != iv.extend_mode)
1832     return NULL;
1833
1834   /* Record the insn to split.  */
1835   ivts = XNEW (struct iv_to_split);
1836   ivts->insn = insn;
1837   ivts->orig_var = dest;
1838   ivts->base_var = NULL_RTX;
1839   ivts->step = iv.step;
1840   ivts->next = NULL;
1841   ivts->n_loc = 1;
1842   ivts->loc[0] = 1;
1843
1844   return ivts;
1845 }
1846
1847 /* Determines which of insns in LOOP can be optimized.
1848    Return a OPT_INFO struct with the relevant hash tables filled
1849    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1850    is undefined for the return value.  */
1851
1852 static struct opt_info *
1853 analyze_insns_in_loop (struct loop *loop)
1854 {
1855   basic_block *body, bb;
1856   unsigned i;
1857   struct opt_info *opt_info = XCNEW (struct opt_info);
1858   rtx insn;
1859   struct iv_to_split *ivts = NULL;
1860   struct var_to_expand *ves = NULL;
1861   PTR *slot1;
1862   PTR *slot2;
1863   vec<edge> edges = get_loop_exit_edges (loop);
1864   edge exit;
1865   bool can_apply = false;
1866
1867   iv_analysis_loop_init (loop);
1868
1869   body = get_loop_body (loop);
1870
1871   if (flag_split_ivs_in_unroller)
1872     {
1873       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1874                                               si_info_hash, si_info_eq, free);
1875       opt_info->iv_to_split_head = NULL;
1876       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1877     }
1878
1879   /* Record the loop exit bb and loop preheader before the unrolling.  */
1880   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1881
1882   if (edges.length () == 1)
1883     {
1884       exit = edges[0];
1885       if (!(exit->flags & EDGE_COMPLEX))
1886         {
1887           opt_info->loop_exit = split_edge (exit);
1888           can_apply = true;
1889         }
1890     }
1891
1892   if (flag_variable_expansion_in_unroller
1893       && can_apply)
1894     {
1895       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1896                                                         ve_info_hash,
1897                                                         ve_info_eq, free);
1898       opt_info->var_to_expand_head = NULL;
1899       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1900     }
1901
1902   for (i = 0; i < loop->num_nodes; i++)
1903     {
1904       bb = body[i];
1905       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1906         continue;
1907
1908       FOR_BB_INSNS (bb, insn)
1909       {
1910         if (!INSN_P (insn))
1911           continue;
1912
1913         if (opt_info->insns_to_split)
1914           ivts = analyze_iv_to_split_insn (insn);
1915
1916         if (ivts)
1917           {
1918             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1919             gcc_assert (*slot1 == NULL);
1920             *slot1 = ivts;
1921             *opt_info->iv_to_split_tail = ivts;
1922             opt_info->iv_to_split_tail = &ivts->next;
1923             continue;
1924           }
1925
1926         if (opt_info->insns_with_var_to_expand)
1927           ves = analyze_insn_to_expand_var (loop, insn);
1928
1929         if (ves)
1930           {
1931             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1932             gcc_assert (*slot2 == NULL);
1933             *slot2 = ves;
1934             *opt_info->var_to_expand_tail = ves;
1935             opt_info->var_to_expand_tail = &ves->next;
1936           }
1937       }
1938     }
1939
1940   edges.release ();
1941   free (body);
1942   return opt_info;
1943 }
1944
1945 /* Called just before loop duplication.  Records start of duplicated area
1946    to OPT_INFO.  */
1947
1948 static void
1949 opt_info_start_duplication (struct opt_info *opt_info)
1950 {
1951   if (opt_info)
1952     opt_info->first_new_block = last_basic_block;
1953 }
1954
/* Return the number of iterations that separate the initialization of
   the base variable from copy number N_COPY.  N_COPIES is the total
   number of newly created copies.  UNROLLING is true when the loop is
   being unrolled and false when it is being peeled.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization is done in the original loop
     body (number 0), so the distance is the copy number itself.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization occurs in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
1979
1980 /* Locate in EXPR the expression corresponding to the location recorded
1981    in IVTS, and return a pointer to the RTX for this location.  */
1982
1983 static rtx *
1984 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1985 {
1986   unsigned i;
1987   rtx *ret = &expr;
1988
1989   for (i = 0; i < ivts->n_loc; i++)
1990     ret = &XEXP (*ret, ivts->loc[i]);
1991
1992   return ret;
1993 }
1994
1995 /* Allocate basic variable for the induction variable chain.  */
1996
1997 static void
1998 allocate_basic_variable (struct iv_to_split *ivts)
1999 {
2000   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2001
2002   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2003 }
2004
2005 /* Insert initialization of basic variable of IVTS before INSN, taking
2006    the initial value from INSN.  */
2007
2008 static void
2009 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2010 {
2011   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2012   rtx seq;
2013
2014   start_sequence ();
2015   expr = force_operand (expr, ivts->base_var);
2016   if (expr != ivts->base_var)
2017     emit_move_insn (ivts->base_var, expr);
2018   seq = get_insns ();
2019   end_sequence ();
2020
2021   emit_insn_before (seq, insn);
2022 }
2023
2024 /* Replace the use of induction variable described in IVTS in INSN
2025    by base variable + DELTA * step.  */
2026
2027 static void
2028 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2029 {
2030   rtx expr, *loc, seq, incr, var;
2031   enum machine_mode mode = GET_MODE (ivts->base_var);
2032   rtx src, dest, set;
2033
2034   /* Construct base + DELTA * step.  */
2035   if (!delta)
2036     expr = ivts->base_var;
2037   else
2038     {
2039       incr = simplify_gen_binary (MULT, mode,
2040                                   ivts->step, gen_int_mode (delta, mode));
2041       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2042                                   ivts->base_var, incr);
2043     }
2044
2045   /* Figure out where to do the replacement.  */
2046   loc = get_ivts_expr (single_set (insn), ivts);
2047
2048   /* If we can make the replacement right away, we're done.  */
2049   if (validate_change (insn, loc, expr, 0))
2050     return;
2051
2052   /* Otherwise, force EXPR into a register and try again.  */
2053   start_sequence ();
2054   var = gen_reg_rtx (mode);
2055   expr = force_operand (expr, var);
2056   if (expr != var)
2057     emit_move_insn (var, expr);
2058   seq = get_insns ();
2059   end_sequence ();
2060   emit_insn_before (seq, insn);
2061
2062   if (validate_change (insn, loc, var, 0))
2063     return;
2064
2065   /* The last chance.  Try recreating the assignment in insn
2066      completely from scratch.  */
2067   set = single_set (insn);
2068   gcc_assert (set);
2069
2070   start_sequence ();
2071   *loc = var;
2072   src = copy_rtx (SET_SRC (set));
2073   dest = copy_rtx (SET_DEST (set));
2074   src = force_operand (src, dest);
2075   if (src != dest)
2076     emit_move_insn (dest, src);
2077   seq = get_insns ();
2078   end_sequence ();
2079
2080   emit_insn_before (seq, insn);
2081   delete_insn (insn);
2082 }
2083
2084
2085 /* Return one expansion of the accumulator recorded in struct VE.  */
2086
2087 static rtx
2088 get_expansion (struct var_to_expand *ve)
2089 {
2090   rtx reg;
2091
2092   if (ve->reuse_expansion == 0)
2093     reg = ve->reg;
2094   else
2095     reg = ve->var_expansions[ve->reuse_expansion - 1];
2096
2097   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2098     ve->reuse_expansion = 0;
2099   else
2100     ve->reuse_expansion++;
2101
2102   return reg;
2103 }
2104
2105
2106 /* Given INSN replace the uses of the accumulator recorded in VE
2107    with a new register.  */
2108
2109 static void
2110 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2111 {
2112   rtx new_reg, set;
2113   bool really_new_expansion = false;
2114
2115   set = single_set (insn);
2116   gcc_assert (set);
2117
2118   /* Generate a new register only if the expansion limit has not been
2119      reached.  Else reuse an already existing expansion.  */
2120   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2121     {
2122       really_new_expansion = true;
2123       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2124     }
2125   else
2126     new_reg = get_expansion (ve);
2127
2128   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2129   if (apply_change_group ())
2130     if (really_new_expansion)
2131       {
2132         ve->var_expansions.safe_push (new_reg);
2133         ve->expansion_count++;
2134       }
2135 }
2136
2137 /* Initialize the variable expansions in loop preheader.  PLACE is the
2138    loop-preheader basic block where the initialization of the
2139    expansions should take place.  The expansions are initialized with
2140    (-0) when the operation is plus or minus to honor sign zero.  This
2141    way we can prevent cases where the sign of the final result is
2142    effected by the sign of the expansion.  Here is an example to
2143    demonstrate this:
2144
2145    for (i = 0 ; i < n; i++)
2146      sum += something;
2147
2148    ==>
2149
2150    sum += something
2151    ....
2152    i = i+1;
2153    sum1 += something
2154    ....
2155    i = i+1
2156    sum2 += something;
2157    ....
2158
2159    When SUM is initialized with -zero and SOMETHING is also -zero; the
2160    final result of sum should be -zero thus the expansions sum1 and sum2
2161    should be initialized with -zero as well (otherwise we will get +zero
2162    as the final result).  */
2163
2164 static void
2165 insert_var_expansion_initialization (struct var_to_expand *ve,
2166                                      basic_block place)
2167 {
2168   rtx seq, var, zero_init;
2169   unsigned i;
2170   enum machine_mode mode = GET_MODE (ve->reg);
2171   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2172
2173   if (ve->var_expansions.length () == 0)
2174     return;
2175
2176   start_sequence ();
2177   switch (ve->op)
2178     {
2179     case FMA:
2180       /* Note that we only accumulate FMA via the ADD operand.  */
2181     case PLUS:
2182     case MINUS:
2183       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2184         {
2185           if (honor_signed_zero_p)
2186             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2187           else
2188             zero_init = CONST0_RTX (mode);
2189           emit_move_insn (var, zero_init);
2190         }
2191       break;
2192
2193     case MULT:
2194       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2195         {
2196           zero_init = CONST1_RTX (GET_MODE (var));
2197           emit_move_insn (var, zero_init);
2198         }
2199       break;
2200
2201     default:
2202       gcc_unreachable ();
2203     }
2204
2205   seq = get_insns ();
2206   end_sequence ();
2207
2208   emit_insn_after (seq, BB_END (place));
2209 }
2210
2211 /* Combine the variable expansions at the loop exit.  PLACE is the
2212    loop exit basic block where the summation of the expansions should
2213    take place.  */
2214
2215 static void
2216 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2217 {
2218   rtx sum = ve->reg;
2219   rtx expr, seq, var, insn;
2220   unsigned i;
2221
2222   if (ve->var_expansions.length () == 0)
2223     return;
2224
2225   start_sequence ();
2226   switch (ve->op)
2227     {
2228     case FMA:
2229       /* Note that we only accumulate FMA via the ADD operand.  */
2230     case PLUS:
2231     case MINUS:
2232       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2233         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2234       break;
2235
2236     case MULT:
2237       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2238         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2239       break;
2240
2241     default:
2242       gcc_unreachable ();
2243     }
2244
2245   expr = force_operand (sum, ve->reg);
2246   if (expr != ve->reg)
2247     emit_move_insn (ve->reg, expr);
2248   seq = get_insns ();
2249   end_sequence ();
2250
2251   insn = BB_HEAD (place);
2252   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2253     insn = NEXT_INSN (insn);
2254
2255   emit_insn_after (seq, insn);
2256 }
2257
2258 /* Strip away REG_EQUAL notes for IVs we're splitting.
2259
2260    Updating REG_EQUAL notes for IVs we split is tricky: We
2261    cannot tell until after unrolling, DF-rescanning, and liveness
2262    updating, whether an EQ_USE is reached by the split IV while
2263    the IV reg is still live.  See PR55006.
2264
2265    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2266    because RTL loop-iv requires us to defer rescanning insns and
2267    any notes attached to them.  So resort to old techniques...  */
2268
2269 static void
2270 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2271 {
2272   struct iv_to_split *ivts;
2273   rtx note = find_reg_equal_equiv_note (insn);
2274   if (! note)
2275     return;
2276   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2277     if (reg_mentioned_p (ivts->orig_var, note))
2278       {
2279         remove_note (insn, note);
2280         return;
2281       }
2282 }
2283
2284 /* Apply loop optimizations in loop copies using the
2285    data which gathered during the unrolling.  Structure
2286    OPT_INFO record that data.
2287
2288    UNROLLING is true if we unrolled (not peeled) the loop.
2289    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2290    the loop (as it should happen in complete unrolling, but not in ordinary
2291    peeling of the loop).  */
2292
2293 static void
2294 apply_opt_in_copies (struct opt_info *opt_info,
2295                      unsigned n_copies, bool unrolling,
2296                      bool rewrite_original_loop)
2297 {
2298   unsigned i, delta;
2299   basic_block bb, orig_bb;
2300   rtx insn, orig_insn, next;
2301   struct iv_to_split ivts_templ, *ivts;
2302   struct var_to_expand ve_templ, *ves;
2303
2304   /* Sanity check -- we need to put initialization in the original loop
2305      body.  */
2306   gcc_assert (!unrolling || rewrite_original_loop);
2307
2308   /* Allocate the basic variables (i0).  */
2309   if (opt_info->insns_to_split)
2310     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2311       allocate_basic_variable (ivts);
2312
2313   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2314     {
2315       bb = BASIC_BLOCK (i);
2316       orig_bb = get_bb_original (bb);
2317
2318       /* bb->aux holds position in copy sequence initialized by
2319          duplicate_loop_to_header_edge.  */
2320       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2321                                         unrolling);
2322       bb->aux = 0;
2323       orig_insn = BB_HEAD (orig_bb);
2324       FOR_BB_INSNS_SAFE (bb, insn, next)
2325         {
2326           if (!INSN_P (insn)
2327               || (DEBUG_INSN_P (insn)
2328                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2329             continue;
2330
2331           while (!INSN_P (orig_insn)
2332                  || (DEBUG_INSN_P (orig_insn)
2333                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2334                          == LABEL_DECL)))
2335             orig_insn = NEXT_INSN (orig_insn);
2336
2337           ivts_templ.insn = orig_insn;
2338           ve_templ.insn = orig_insn;
2339
2340           /* Apply splitting iv optimization.  */
2341           if (opt_info->insns_to_split)
2342             {
2343               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2344
2345               ivts = (struct iv_to_split *)
2346                 htab_find (opt_info->insns_to_split, &ivts_templ);
2347
2348               if (ivts)
2349                 {
2350                   gcc_assert (GET_CODE (PATTERN (insn))
2351                               == GET_CODE (PATTERN (orig_insn)));
2352
2353                   if (!delta)
2354                     insert_base_initialization (ivts, insn);
2355                   split_iv (ivts, insn, delta);
2356                 }
2357             }
2358           /* Apply variable expansion optimization.  */
2359           if (unrolling && opt_info->insns_with_var_to_expand)
2360             {
2361               ves = (struct var_to_expand *)
2362                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2363               if (ves)
2364                 {
2365                   gcc_assert (GET_CODE (PATTERN (insn))
2366                               == GET_CODE (PATTERN (orig_insn)));
2367                   expand_var_during_unrolling (ves, insn);
2368                 }
2369             }
2370           orig_insn = NEXT_INSN (orig_insn);
2371         }
2372     }
2373
2374   if (!rewrite_original_loop)
2375     return;
2376
2377   /* Initialize the variable expansions in the loop preheader
2378      and take care of combining them at the loop exit.  */
2379   if (opt_info->insns_with_var_to_expand)
2380     {
2381       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2382         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2383       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2384         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2385     }
2386
2387   /* Rewrite also the original loop body.  Find them as originals of the blocks
2388      in the last copied iteration, i.e. those that have
2389      get_bb_copy (get_bb_original (bb)) == bb.  */
2390   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2391     {
2392       bb = BASIC_BLOCK (i);
2393       orig_bb = get_bb_original (bb);
2394       if (get_bb_copy (orig_bb) != bb)
2395         continue;
2396
2397       delta = determine_split_iv_delta (0, n_copies, unrolling);
2398       for (orig_insn = BB_HEAD (orig_bb);
2399            orig_insn != NEXT_INSN (BB_END (bb));
2400            orig_insn = next)
2401         {
2402           next = NEXT_INSN (orig_insn);
2403
2404           if (!INSN_P (orig_insn))
2405             continue;
2406
2407           ivts_templ.insn = orig_insn;
2408           if (opt_info->insns_to_split)
2409             {
2410               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2411
2412               ivts = (struct iv_to_split *)
2413                 htab_find (opt_info->insns_to_split, &ivts_templ);
2414               if (ivts)
2415                 {
2416                   if (!delta)
2417                     insert_base_initialization (ivts, orig_insn);
2418                   split_iv (ivts, orig_insn, delta);
2419                   continue;
2420                 }
2421             }
2422
2423         }
2424     }
2425 }
2426
2427 /* Release OPT_INFO.  */
2428
2429 static void
2430 free_opt_info (struct opt_info *opt_info)
2431 {
2432   if (opt_info->insns_to_split)
2433     htab_delete (opt_info->insns_to_split);
2434   if (opt_info->insns_with_var_to_expand)
2435     {
2436       struct var_to_expand *ves;
2437
2438       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2439         ves->var_expansions.release ();
2440       htab_delete (opt_info->insns_with_var_to_expand);
2441     }
2442   free (opt_info);
2443 }