gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "rtl.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "cfglayout.h"
  31 #include "params.h"
  32 #include "output.h"
  33 #include "expr.h"
  34 #include "hashtab.h"
  35 #include "recog.h"
  36 #include "target.h"
  37
  38 /* This pass performs loop unrolling and peeling.  We only perform these
  39    optimizations on innermost loops (with single exception) because
  40    the impact on performance is greatest here, and we want to avoid
  41    unnecessary code size growth.  The gain is caused by greater sequentiality
  42    of code, better code to optimize for further passes and in some cases
  43    by fewer testings of exit conditions.  The main problem is code growth,
  44    that impacts performance negatively due to effect of caches.
  45
  46    What we do:
  47
  48    -- complete peeling of once-rolling loops; this is the above mentioned
  49       exception, as this causes loop to be cancelled completely and
  50       does not cause code growth
  51    -- complete peeling of loops that roll (small) constant times.
  52    -- simple peeling of first iterations of loops that do not roll much
  53       (according to profile feedback)
  54    -- unrolling of loops that roll constant times; this is almost always
  55       win, as we get rid of exit condition tests.
  56    -- unrolling of loops that roll number of times that we can compute
  57       in runtime; we also get rid of exit condition tests here, but there
  58       is the extra expense for calculating the number of iterations
  59    -- simple unrolling of remaining loops; this is performed only if we
  60       are asked to, as the gain is questionable in this case and often
  61       it may even slow down the code
  62    For more detailed descriptions of each of those, see the comments at
  63    the appropriate functions below.
  64
  65    There are a lot of parameters (defined and described in params.def) that
  66    control how much we unroll/peel.
  67
  68    ??? A great problem is that we don't have a good way to determine
  69    how many times we should unroll the loop; the experiments I have made
  70    showed that this choice may affect performance by several percent.
  71    */
  72
  73 /* Information about one induction variable to split.  The records are
        chained through NEXT.  */
  74
  75 struct iv_to_split
  76 {
  77   rtx insn;             /* The insn in which the induction variable occurs.  */
  78   rtx base_var;         /* The variable on which the values in the further
  79                            iterations are based.  */
  80   rtx step;             /* Step of the induction variable.  */
  81   struct iv_to_split *next; /* Next entry in walking order.  */
  82   unsigned n_loc;       /* Number of entries of LOC that are in use.  */
  83   unsigned loc[3];      /* Location where the definition of the induction
  84                            variable occurs in the insn.  For example if
  85                            N_LOC is 2, the expression is located at
  86                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  87 };
  88
  89 /* Information about one accumulator to expand.  The records are chained
        through NEXT.  */
  90
  91 struct var_to_expand
  92 {
  93   rtx insn;                        /* The insn in which the variable expansion occurs.  */
  94   rtx reg;                         /* The accumulator which is expanded.  */
  95   VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  96   struct var_to_expand *next;      /* Next entry in walking order.  */
  97   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  98                                       or multiplication.  */
  99   int expansion_count;             /* Number of expansions generated so far.  */
 100   int reuse_expansion;             /* The expansion we intend to reuse to expand
 101                                       the accumulator.  If REUSE_EXPANSION is 0, reuse
 102                                       the original accumulator.  Otherwise use
 103                                       var_expansions[REUSE_EXPANSION - 1].  */
 104   unsigned accum_pos;              /* The position in which the accumulator is placed in
 105                                       the insn src.  For example in x = x + something
 106                                       accum_pos is 0 while in x = something + x accum_pos
 107                                       is 1.  */
 108 };
 109
 110 /* Information about the optimizations applied in
 111    the unrolled loop.  Each of the two kinds of records (ivs to split and
        accumulators to expand) is kept both in a hashtable and in a linked
        list described by a head pointer and a pointer to its tail.  */
 112
 113 struct opt_info
 114 {
 115   htab_t insns_to_split;           /* A hashtable of insns to split.  */
 116   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 117   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 118   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
 119                                       to expand.  */
 120   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 121   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 122   unsigned first_new_block;        /* The first basic block that was
 123                                       duplicated.  */
 124   basic_block loop_exit;           /* The loop exit basic block.  */
 125   basic_block loop_preheader;      /* The loop preheader basic block.  */
 126 };
 127
 128 static void decide_unrolling_and_peeling (int);  /* Prototypes of the file-local (static) helpers start here.  */
 129 static void peel_loops_completely (int);
 130 static void decide_peel_simple (struct loop *, int);
 131 static void decide_peel_once_rolling (struct loop *, int);
 132 static void decide_peel_completely (struct loop *, int);
 133 static void decide_unroll_stupid (struct loop *, int);
 134 static void decide_unroll_constant_iterations (struct loop *, int);
 135 static void decide_unroll_runtime_iterations (struct loop *, int);
 136 static void peel_loop_simple (struct loop *);  /* Routines that carry out a decision on one loop.  */
 137 static void peel_loop_completely (struct loop *);
 138 static void unroll_loop_stupid (struct loop *);
 139 static void unroll_loop_constant_iterations (struct loop *);
 140 static void unroll_loop_runtime_iterations (struct loop *);
 141 static struct opt_info *analyze_insns_in_loop (struct loop *);  /* Helpers working on struct opt_info and its records.  */
 142 static void opt_info_start_duplication (struct opt_info *);
 143 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 144 static void free_opt_info (struct opt_info *);
 145 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 146 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 147 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 148 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 149 static void insert_var_expansion_initialization (struct var_to_expand *,
 150                                                  basic_block);
 151 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 152                                              basic_block);
 153 static rtx get_expansion (struct var_to_expand *);
 154
 155 /* Unroll and/or peel (depending on FLAGS) LOOPS.  Complete peeling is
        carried out first by peel_loops_completely; afterwards
        decide_unrolling_and_peeling records a decision in
        LOOP->LPT_DECISION for every loop, and the chosen transformation is
        applied to each loop, innermost loops first.  FLAGS is passed on
        unchanged to both helpers.  */
 156 void
 157 unroll_and_peel_loops (int flags)
 158 {
 159   struct loop *loop;
 160   bool check;   /* True when the current loop was transformed.  */
 161   loop_iterator li;
 162
 163   /* First perform complete loop peeling (it is almost surely a win,
 164      and affects parameters for further decision a lot).  */
 165   peel_loops_completely (flags);
 166
 167   /* Now decide rest of unrolling and peeling.  */
 168   decide_unrolling_and_peeling (flags);
 169
 170   /* Scan the loops, inner ones first.  */
 171   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 172     {
 173       check = true;
 174       /* And perform the appropriate transformations.  */
 175       switch (loop->lpt_decision.decision)
 176         {
 177         case LPT_PEEL_COMPLETELY:
 178           /* Already done by peel_loops_completely; moreover
                  decide_unrolling_and_peeling resets every decision to LPT_NONE
                  and never selects this value again.  */
 179           gcc_unreachable ();
 180         case LPT_PEEL_SIMPLE:
 181           peel_loop_simple (loop);
 182           break;
 183         case LPT_UNROLL_CONSTANT:
 184           unroll_loop_constant_iterations (loop);
 185           break;
 186         case LPT_UNROLL_RUNTIME:
 187           unroll_loop_runtime_iterations (loop);
 188           break;
 189         case LPT_UNROLL_STUPID:
 190           unroll_loop_stupid (loop);
 191           break;
 192         case LPT_NONE:
 193           check = false;   /* Nothing was changed, so nothing to verify.  */
 194           break;
 195         default:
 196           gcc_unreachable ();
 197         }
 198       if (check)
 199         {
 200 #ifdef ENABLE_CHECKING
              /* Only in checking builds: validate the loop structure after each
                 transformation.  */
 201           verify_loop_structure ();
 202 #endif
 203         }
 204     }
 205
 206   iv_analysis_done ();
 207 }
 208
 209 /* Check whether exit of the LOOP is at the end of loop body.  This is
        reported when the IN_EDGE of the simple loop description leads to the
        latch of LOOP and the latch contains no insn for which INSN_P holds.  */
 210
 211 static bool
 212 loop_exit_at_end_p (struct loop *loop)
 213 {
 214   struct niter_desc *desc = get_simple_loop_desc (loop);
 215   rtx insn;
 216
       /* The in edge must enter the latch directly.  */
 217   if (desc->in_edge->dest != loop->latch)
 218     return false;
 219
 220   /* Check that the latch is empty.  */
 221   FOR_BB_INSNS (loop->latch, insn)
 222     {
 223       if (INSN_P (insn))
 224         return false;
 225     }
 226
 227   return true;
 228 }
 229
 230 /* Depending on FLAGS, check whether to peel loops completely and do so.
        Every loop is visited, innermost first; its decision is reset, its insn
        count is recomputed, and it is peeled right away when one of the two
        decide_peel_* routines selects LPT_PEEL_COMPLETELY.  */
 231 static void
 232 peel_loops_completely (int flags)
 233 {
 234   struct loop *loop;
 235   loop_iterator li;
 236
 237   /* Scan the loops, the inner ones first.  */
 238   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 239     {
 240       loop->lpt_decision.decision = LPT_NONE;
 241
 242       if (dump_file)
 243         fprintf (dump_file,
 244                  "\n;; *** Considering loop %d for complete peeling ***\n",
 245                  loop->num);
 246
           /* The size limits in the decide_peel_* routines read LOOP->NINSNS.  */
 247       loop->ninsns = num_loop_insns (loop);
 248
           /* Try the once-rolling case first; fall back to general complete
              peeling only if it did not apply.  */
 249       decide_peel_once_rolling (loop, flags);
 250       if (loop->lpt_decision.decision == LPT_NONE)
 251         decide_peel_completely (loop, flags);
 252
 253       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 254         {
 255           peel_loop_completely (loop);
 256 #ifdef ENABLE_CHECKING
 257           verify_loop_structure ();
 258 #endif
 259         }
 260     }
 261 }
 262
 263 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.
        The result for each loop is stored in LOOP->LPT_DECISION; nothing is
        transformed here.  Loops that are optimized for size, cannot be
        duplicated, or contain inner loops keep the decision LPT_NONE.  */
 264 static void
 265 decide_unrolling_and_peeling (int flags)
 266 {
 267   struct loop *loop;
 268   loop_iterator li;
 269
 270   /* Scan the loops, inner ones first.  */
 271   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 272     {
 273       loop->lpt_decision.decision = LPT_NONE;
 274
 275       if (dump_file)
 276         fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 277
 278       /* Do not peel cold areas.  */
 279       if (optimize_loop_for_size_p (loop))
 280         {
 281           if (dump_file)
 282             fprintf (dump_file, ";; Not considering loop, cold area\n");
 283           continue;
 284         }
 285
 286       /* Can the loop be manipulated?  */
 287       if (!can_duplicate_loop_p (loop))
 288         {
 289           if (dump_file)
 290             fprintf (dump_file,
 291                      ";; Not considering loop, cannot duplicate\n");
 292           continue;
 293         }
 294
 295       /* Skip non-innermost loops.  */
 296       if (loop->inner)
 297         {
 298           if (dump_file)
 299             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 300           continue;
 301         }
 302
           /* Both counts are used as divisors by the decide_unroll_* routines
              visible in this file.  */
 303       loop->ninsns = num_loop_insns (loop);
 304       loop->av_ninsns = average_num_loop_insns (loop);
 305
 306       /* Try transformations one by one in decreasing order of
 307          priority.  Each routine leaves the decision at LPT_NONE when it
              does not apply, which lets the next one have a go.  */
 308
 309       decide_unroll_constant_iterations (loop, flags);
 310       if (loop->lpt_decision.decision == LPT_NONE)
 311         decide_unroll_runtime_iterations (loop, flags);
 312       if (loop->lpt_decision.decision == LPT_NONE)
 313         decide_unroll_stupid (loop, flags);
 314       if (loop->lpt_decision.decision == LPT_NONE)
 315         decide_peel_simple (loop, flags);
 316     }
 317 }
 318
 319 /* Decide whether the LOOP is once rolling and suitable for complete
 320    peeling.  On success LOOP->LPT_DECISION.DECISION is set to
        LPT_PEEL_COMPLETELY; otherwise it is left untouched.  FLAGS is not
        used.  */
 321 static void
 322 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 323 {
 324   struct niter_desc *desc;
 325
 326   if (dump_file)
 327     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 328
 329   /* Is the loop small enough?  */
 330   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 331     {
 332       if (dump_file)
 333         fprintf (dump_file, ";; Not considering loop, is too big\n");
 334       return;
 335     }
 336
 337   /* Check for simple loops.  */
 338   desc = get_simple_loop_desc (loop);
 339
 340   /* Check number of iterations.  The loop qualifies only if it is simple,
          needs no assumptions, is not infinite, and has a constant iteration
          count equal to zero.  */
 341   if (!desc->simple_p
 342       || desc->assumptions
 343       || desc->infinite
 344       || !desc->const_iter
 345       || desc->niter != 0)
 346     {
 347       if (dump_file)
 348         fprintf (dump_file,
 349                  ";; Unable to prove that the loop rolls exactly once\n");
 350       return;
 351     }
 352
 353   /* Success.  */
 354   if (dump_file)
 355     fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
 356   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 357 }
 358
 359 /* Decide whether the LOOP is suitable for complete peeling.  On success
        LOOP->LPT_DECISION.DECISION is set to LPT_PEEL_COMPLETELY; otherwise it
        is left untouched.  FLAGS is not used.  LOOP->NINSNS must have been
        computed by the caller, as it is used as a divisor below.  */
 360 static void
 361 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 362 {
 363   unsigned npeel;
 364   struct niter_desc *desc;
 365
 366   if (dump_file)
 367     fprintf (dump_file, "\n;; Considering peeling completely\n");
 368
 369   /* Skip non-innermost loops.  */
 370   if (loop->inner)
 371     {
 372       if (dump_file)
 373         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 374       return;
 375     }
 376
 377   /* Do not peel cold areas.  */
 378   if (optimize_loop_for_size_p (loop))
 379     {
 380       if (dump_file)
 381         fprintf (dump_file, ";; Not considering loop, cold area\n");
 382       return;
 383     }
 384
 385   /* Can the loop be manipulated?  */
 386   if (!can_duplicate_loop_p (loop))
 387     {
 388       if (dump_file)
 389         fprintf (dump_file,
 390                  ";; Not considering loop, cannot duplicate\n");
 391       return;
 392     }
 393
 394   /* npeel = number of iterations to peel.  It is limited by the insn
          budget divided by the loop size and by the maximal number of
          peelings.  */
 395   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 396   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 397     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 398
 399   /* Is the loop small enough?  */
 400   if (!npeel)
 401     {
 402       if (dump_file)
 403         fprintf (dump_file, ";; Not considering loop, is too big\n");
 404       return;
 405     }
 406
 407   /* Check for simple loops.  */
 408   desc = get_simple_loop_desc (loop);
 409
 410   /* Check number of iterations.  */
 411   if (!desc->simple_p
 412       || desc->assumptions
 413       || !desc->const_iter
 414       || desc->infinite)
 415     {
 416       if (dump_file)
 417         fprintf (dump_file,
 418                  ";; Unable to prove that the loop iterates constant times\n");
 419       return;
 420     }
 421
       /* NPEEL is nonzero here (checked above), so NPEEL - 1 cannot wrap.  */
 422   if (desc->niter > npeel - 1)
 423     {
 424       if (dump_file)
 425         {
 426           fprintf (dump_file,
 427                    ";; Not peeling loop completely, rolls too much (");
 428           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
               /* NOTE(review): NPEEL is unsigned but is printed with %d.  */
 429           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 430         }
 431       return;
 432     }
 433
 434   /* Success.  */
 435   if (dump_file)
 436     fprintf (dump_file, ";; Decided to peel loop completely\n");
 437   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 438 }
 439
 440 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 441    completely.  The transformation done:
 442
 443    for (i = 0; i < 4; i++)
 444      body;
 445
 446    ==>
 447
 448    i = 0;
 449    body; i++;
 450    body; i++;
 451    body; i++;
 452    body; i++;
 453
        The number of copies placed on the preheader edge is the NITER field of
        the simple loop description; when it is zero no copy is made and only
        the path starting at the IN_EDGE of the description is removed.  */
 454 static void
 455 peel_loop_completely (struct loop *loop)
 456 {
 457   sbitmap wont_exit;
 458   unsigned HOST_WIDE_INT npeel;
 459   unsigned i;
 460   VEC (edge, heap) *remove_edges;
 461   edge ein;
 462   struct niter_desc *desc = get_simple_loop_desc (loop);
 463   struct opt_info *opt_info = NULL;
 464
 465   npeel = desc->niter;
 466
 467   if (npeel)
 468     {
 469       bool ok;
 470
           /* NPEEL + 1 bits, all set except bit 0, and except bit 1 when the
              description carries NOLOOP_ASSUMPTIONS.  Presumably a set bit
              tells duplicate_loop_to_header_edge that the corresponding copy
              does not leave through the exit edge -- TODO confirm against that
              function.  */
 471       wont_exit = sbitmap_alloc (npeel + 1);
 472       sbitmap_ones (wont_exit);
 473       RESET_BIT (wont_exit, 0);
 474       if (desc->noloop_assumptions)
 475         RESET_BIT (wont_exit, 1);
 476
 477       remove_edges = NULL;
 478
 479       if (flag_split_ivs_in_unroller)
 480         opt_info = analyze_insns_in_loop (loop);
 481
           /* NOTE(review): OPT_INFO may still be NULL here; this relies on
              opt_info_start_duplication accepting NULL -- confirm at its
              definition.  */
 482       opt_info_start_duplication (opt_info);
 483       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 484                                           npeel,
 485                                           wont_exit, desc->out_edge,
 486                                           &remove_edges,
 487                                           DLTHE_FLAG_UPDATE_FREQ
 488                                           | DLTHE_FLAG_COMPLETTE_PEEL
 489                                           | (opt_info
 490                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 491       gcc_assert (ok);
 492
 493       free (wont_exit);
 494
 495       if (opt_info)
 496         {
 497           apply_opt_in_copies (opt_info, npeel, false, true);
 498           free_opt_info (opt_info);
 499         }
 500
 501       /* Remove the exit edges collected in REMOVE_EDGES by the
              duplication.  */
 502       FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
 503         remove_path (ein);
 504       VEC_free (edge, heap, remove_edges);
 505     }
 506
       /* Read the edge before free_simple_loop_desc is called on LOOP.  */
 507   ein = desc->in_edge;
 508   free_simple_loop_desc (loop);
 509
 510   /* Now remove the unreachable part of the last iteration and cancel
 511      the loop.  */
 512   remove_path (ein);
 513
 514   if (dump_file)
 515     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 516 }
 517
 518 /* Decide whether to unroll LOOP iterating constant number of times
 519    and how much.  Nothing is done unless FLAGS contains UAP_UNROLL.  On
        success LOOP->LPT_DECISION.DECISION is set to LPT_UNROLL_CONSTANT and
        LOOP->LPT_DECISION.TIMES to the chosen number of additional body
        copies; otherwise the decision is left untouched.  LOOP->NINSNS and
        LOOP->AV_NINSNS must have been computed by the caller, as they are used
        as divisors below.  */
 520
 521 static void
 522 decide_unroll_constant_iterations (struct loop *loop, int flags)
 523 {
 524   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 525   struct niter_desc *desc;
 526
 527   if (!(flags & UAP_UNROLL))
 528     {
 529       /* We were not asked to, just return back silently.  */
 530       return;
 531     }
 532
 533   if (dump_file)
 534     fprintf (dump_file,
 535              "\n;; Considering unrolling loop with constant "
 536              "number of iterations\n");
 537
 538   /* nunroll = total number of copies of the original loop body in
 539      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).
          It is the minimum of the two insn budgets divided by the respective
          loop size and of the maximal number of unrollings.  */
 540   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 541   nunroll_by_av
 542     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 543   if (nunroll > nunroll_by_av)
 544     nunroll = nunroll_by_av;
 545   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 546     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 547
 548   /* Skip big loops.  */
 549   if (nunroll <= 1)
 550     {
 551       if (dump_file)
 552         fprintf (dump_file, ";; Not considering loop, is too big\n");
 553       return;
 554     }
 555
 556   /* Check for simple loops.  */
 557   desc = get_simple_loop_desc (loop);
 558
 559   /* Check number of iterations.  */
 560   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 561     {
 562       if (dump_file)
 563         fprintf (dump_file,
 564                  ";; Unable to prove that the loop iterates constant times\n");
 565       return;
 566     }
 567
 568   /* Check whether the loop rolls enough to consider.  */
 569   if (desc->niter < 2 * nunroll)
 570     {
 571       if (dump_file)
 572         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 573       return;
 574     }
 575
 576   /* Success; now compute number of iterations to unroll.  We alter
 577      nunroll so that as few as possible copies of loop body are
 578      necessary, while still not decreasing the number of unrollings
 579      too much (at most by 1).  */
       /* Starting value that a candidate has to beat.  */
 580   best_copies = 2 * nunroll + 10;
 581
       /* Start the search at 2 * NUNROLL + 2, but not above NITER - 2.  */
 582   i = 2 * nunroll + 2;
 583   if (i - 1 >= desc->niter)
 584     i = desc->niter - 2;
 585
       /* NUNROLL is at least 2 here, so the lower bound NUNROLL - 1 is at
          least 1 and the unsigned counter I cannot wrap around.  */
 586   for (; i >= nunroll - 1; i--)
 587     {
 588       unsigned exit_mod = desc->niter % (i + 1);
 589
           /* Count the body copies needed for I, mirroring
              unroll_loop_constant_iterations: the I + 1 bodies of the unrolled
              loop plus the iterations peeled in front of it (EXIT_MOD of them,
              or EXIT_MOD + 1 when the exit test is at the end of the body and
              peeling is needed at all).  The result of loop_exit_at_end_p does
              not depend on I.  */
 590       if (!loop_exit_at_end_p (loop))
 591         n_copies = exit_mod + i + 1;
 592       else if (exit_mod != (unsigned) i
 593                || desc->noloop_assumptions != NULL_RTX)
 594         n_copies = exit_mod + i + 2;
 595       else
 596         n_copies = i + 1;
 597
           /* On ties the larger I wins, since the search runs downwards and
              the comparison is strict.  */
 598       if (n_copies < best_copies)
 599         {
 600           best_copies = n_copies;
 601           best_unroll = i;
 602         }
 603     }
 604
       /* NOTE(review): the three values printed are unsigned but use %d.  */
 605   if (dump_file)
 606     fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
 607              best_unroll + 1, best_copies, nunroll);
 608
 609   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 610   loop->lpt_decision.times = best_unroll;
 611
 612   if (dump_file)
 613     fprintf (dump_file,
 614              ";; Decided to unroll the constant times rolling loop, %d times.\n",
 615              loop->lpt_decision.times);
 616 }
 617
 618 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
 619    times.  The transformation does this:
 620
 621    for (i = 0; i < 102; i++)
 622      body;
 623
 624    ==>
 625
 626    i = 0;
 627    body; i++;
 628    body; i++;
 629    while (i < 102)
 630      {
 631        body; i++;
 632        body; i++;
 633        body; i++;
 634        body; i++;
 635      }
 636
        The number of iterations peeled in front of the loop is
        NITER % (TIMES + 1) when the exit test is not at the end of the loop
        body; when it is at the end, one more iteration is peeled, unless the
        remainder equals TIMES and there are no NOLOOP_ASSUMPTIONS, in which
        case nothing is peeled.  The NITER, NITER_MAX, NITER_EXPR and (for an
        exit at the end) IN_EDGE and OUT_EDGE fields of the simple loop
        description are updated to describe the unrolled loop.  */
 637 static void
 638 unroll_loop_constant_iterations (struct loop *loop)
 639 {
 640   unsigned HOST_WIDE_INT niter;
 641   unsigned exit_mod;
 642   sbitmap wont_exit;
 643   unsigned i;
 644   VEC (edge, heap) *remove_edges;
 645   edge e;
 646   unsigned max_unroll = loop->lpt_decision.times;
 647   struct niter_desc *desc = get_simple_loop_desc (loop);
 648   bool exit_at_end = loop_exit_at_end_p (loop);
 649   struct opt_info *opt_info = NULL;
 650   bool ok;
 651
 652   niter = desc->niter;
 653
 654   /* Should not get here (such loop should be peeled instead).  */
 655   gcc_assert (niter > max_unroll + 1);
 656
 657   exit_mod = niter % (max_unroll + 1);
 658
       /* One bit per body copy of the unrolled loop; the same bitmap is also
          used for the peeling step below.  Presumably a set bit tells
          duplicate_loop_to_header_edge that the corresponding copy does not
          leave through the exit edge -- TODO confirm against that function.  */
 659   wont_exit = sbitmap_alloc (max_unroll + 1);
 660   sbitmap_ones (wont_exit);
 661
 662   remove_edges = NULL;
 663   if (flag_split_ivs_in_unroller
 664       || flag_variable_expansion_in_unroller)
 665     opt_info = analyze_insns_in_loop (loop);
 666
 667   if (!exit_at_end)
 668     {
 669       /* The exit is not at the end of the loop; leave exit test
 670          in the first copy, so that the loops that start with test
 671          of exit condition have continuous body after unrolling.  */
 672
 673       if (dump_file)
 674         fprintf (dump_file, ";; Condition on beginning of loop.\n");
 675
 676       /* Peel exit_mod iterations.  */
 677       RESET_BIT (wont_exit, 0);
 678       if (desc->noloop_assumptions)
 679         RESET_BIT (wont_exit, 1);
 680
 681       if (exit_mod)
 682         {
               /* The recorded optimizations are applied to the peeled copies
                  only when more than one iteration is peeled.  */
 683           opt_info_start_duplication (opt_info);
 684           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 685                                               exit_mod,
 686                                               wont_exit, desc->out_edge,
 687                                               &remove_edges,
 688                                               DLTHE_FLAG_UPDATE_FREQ
 689                                               | (opt_info && exit_mod > 1
 690                                                  ? DLTHE_RECORD_COPY_NUMBER
 691                                                    : 0));
 692           gcc_assert (ok);
 693
 694           if (opt_info && exit_mod > 1)
 695             apply_opt_in_copies (opt_info, exit_mod, false, false);
 696
               /* Account for the peeled iterations in the description.  */
 697           desc->noloop_assumptions = NULL_RTX;
 698           desc->niter -= exit_mod;
 699           desc->niter_max -= exit_mod;
 700         }
 701
           /* For the unrolling step only bit 0 stays cleared.  */
 702       SET_BIT (wont_exit, 1);
 703     }
 704   else
 705     {
 706       /* Leave exit test in last copy, for the same reason as above if
 707          the loop tests the condition at the end of loop body.  */
 708
 709       if (dump_file)
 710         fprintf (dump_file, ";; Condition on end of loop.\n");
 711
 712       /* We know that niter >= max_unroll + 2; so we do not need to care of
 713          case when we would exit before reaching the loop.  So just peel
 714          exit_mod + 1 iterations.  */
 715       if (exit_mod != max_unroll
 716           || desc->noloop_assumptions)
 717         {
 718           RESET_BIT (wont_exit, 0);
 719           if (desc->noloop_assumptions)
 720             RESET_BIT (wont_exit, 1);
 721
 722           opt_info_start_duplication (opt_info);
 723           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 724                                               exit_mod + 1,
 725                                               wont_exit, desc->out_edge,
 726                                               &remove_edges,
 727                                               DLTHE_FLAG_UPDATE_FREQ
 728                                               | (opt_info && exit_mod > 0
 729                                                  ? DLTHE_RECORD_COPY_NUMBER
 730                                                    : 0));
 731           gcc_assert (ok);
 732
 733           if (opt_info && exit_mod > 0)
 734             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 735
               /* Account for the peeled iterations in the description.  */
 736           desc->niter -= exit_mod + 1;
 737           desc->niter_max -= exit_mod + 1;
 738           desc->noloop_assumptions = NULL_RTX;
 739
               /* Restore the bits cleared for the peeling step.  */
 740           SET_BIT (wont_exit, 0);
 741           SET_BIT (wont_exit, 1);
 742         }
 743
           /* For the unrolling step only the bit of the last copy is
              cleared.  */
 744       RESET_BIT (wont_exit, max_unroll);
 745     }
 746
 747   /* Now unroll the loop: MAX_UNROLL further copies are placed on the latch
          edge.  */
 748
 749   opt_info_start_duplication (opt_info);
 750   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 751                                       max_unroll,
 752                                       wont_exit, desc->out_edge,
 753                                       &remove_edges,
 754                                       DLTHE_FLAG_UPDATE_FREQ
 755                                       | (opt_info
 756                                          ? DLTHE_RECORD_COPY_NUMBER
 757                                            : 0));
 758   gcc_assert (ok);
 759
 760   if (opt_info)
 761     {
 762       apply_opt_in_copies (opt_info, max_unroll, true, true);
 763       free_opt_info (opt_info);
 764     }
 765
 766   free (wont_exit);
 767
 768   if (exit_at_end)
 769     {
 770       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 771       /* Find a new in and out edge; they are in the last copy we have made.
              Of the first two successor edges of EXIT_BLOCK, the one leading to
              the old exit destination becomes the out edge and the other one
              the in edge.  */
 772
 773       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 774         {
 775           desc->out_edge = EDGE_SUCC (exit_block, 0);
 776           desc->in_edge = EDGE_SUCC (exit_block, 1);
 777         }
 778       else
 779         {
 780           desc->out_edge = EDGE_SUCC (exit_block, 1);
 781           desc->in_edge = EDGE_SUCC (exit_block, 0);
 782         }
 783     }
 784
       /* Each pass through the unrolled loop now covers MAX_UNROLL + 1 of the
          original iterations.  */
 785   desc->niter /= max_unroll + 1;
 786   desc->niter_max /= max_unroll + 1;
 787   desc->niter_expr = GEN_INT (desc->niter);
 788
 789   /* Remove the edges collected in REMOVE_EDGES by the duplications.  */
 790   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
 791     remove_path (e);
 792   VEC_free (edge, heap, remove_edges);
 793
 794   if (dump_file)
 795     fprintf (dump_file,
 796              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 797              max_unroll, num_loop_insns (loop));
 798 }
 799
 800 /* Decide whether to unroll LOOP iterating runtime computable number of times
 801    and how much.  Nothing is done unless FLAGS contains UAP_UNROLL.  On
        success LOOP->LPT_DECISION.DECISION is set to LPT_UNROLL_RUNTIME and
        LOOP->LPT_DECISION.TIMES to one less than a power of two; otherwise the
        decision is left untouched.  LOOP->NINSNS and LOOP->AV_NINSNS must have
        been computed by the caller, as they are used as divisors below.  */
 802 static void
 803 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 804 {
 805   unsigned nunroll, nunroll_by_av, i;
 806   struct niter_desc *desc;
 807
 808   if (!(flags & UAP_UNROLL))
 809     {
 810       /* We were not asked to, just return back silently.  */
 811       return;
 812     }
 813
 814   if (dump_file)
 815     fprintf (dump_file,
 816              "\n;; Considering unrolling loop with runtime "
 817              "computable number of iterations\n");
 818
 819   /* nunroll = total number of copies of the original loop body in
 820      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
 821   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 822   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 823   if (nunroll > nunroll_by_av)
 824     nunroll = nunroll_by_av;
 825   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 826     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 827
       /* Let the target adjust the unroll factor if it provides the hook.  */
 828   if (targetm.loop_unroll_adjust)
 829     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 830
 831   /* Skip big loops.  */
 832   if (nunroll <= 1)
 833     {
 834       if (dump_file)
 835         fprintf (dump_file, ";; Not considering loop, is too big\n");
 836       return;
 837     }
 838
 839   /* Check for simple loops.  */
 840   desc = get_simple_loop_desc (loop);
 841
 842   /* Check simpleness.  */
 843   if (!desc->simple_p || desc->assumptions)
 844     {
 845       if (dump_file)
 846         fprintf (dump_file,
 847                  ";; Unable to prove that the number of iterations "
 848                  "can be counted in runtime\n");
 849       return;
 850     }
 851
       /* Loops with a constant iteration count are left to
          decide_unroll_constant_iterations.  */
 852   if (desc->const_iter)
 853     {
 854       if (dump_file)
 855         fprintf (dump_file, ";; Loop iterates constant times\n");
 856       return;
 857     }
 858
 859   /* If we have profile feedback, check whether the loop rolls.  The bound
          NITER_MAX of the description is checked regardless of the profile.  */
 860   if ((loop->header->count
 861        && expected_loop_iterations (loop) < 2 * nunroll)
 862       || desc->niter_max < 2 * nunroll)
 863     {
 864       if (dump_file)
 865         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 866       return;
 867     }
 868
 869   /* Success; now force nunroll to be power of 2, as we are unable to
 870      cope with overflows in computation of number of iterations.  After
          the loop below, I is the largest power of two not exceeding
          NUNROLL.  */
 871   for (i = 1; 2 * i <= nunroll; i *= 2)
 872     continue;
 873
 874   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 875   loop->lpt_decision.times = i - 1;
 876
 877   if (dump_file)
 878     fprintf (dump_file,
 879              ";; Decided to unroll the runtime computable "
 880              "times rolling loop, %d times.\n",
 881              loop->lpt_decision.times);
 882 }
 883
 884 /* Split edge E, emit the insn sequence INSNS into the block created by the
 885    split, and return that block.  When INSNS is NULL_RTX, nothing is changed
 886    and NULL is returned instead.  */
 887
 888 basic_block
 889 split_edge_and_insert (edge e, rtx insns)
 890 {
 891   basic_block new_bb;
 892
 893   if (insns == NULL_RTX)
 894     return NULL;
 895   new_bb = split_edge (e);
 896   emit_insn_after (insns, BB_END (new_bb));
 897
 898   /* ??? Historically we assumed that INSNS might contain control flow insns
 899      and that sub basic blocks would have to be found in the new block to
 900      keep the CFG valid.  To that end BB_SUPERBLOCK used to be set on it and
 901      break_superblocks was called when going out of cfglayout mode.  But it
 902      turns out that this never happens; and that if it does ever happen,
 903      the TODO_verify_flow at the end of the RTL loop passes would fail.
 904
 905      Control flow insns in INSNS were expected for two reasons.  The first
 906      is a comparison that has to be done in parts, and the second is the
 907      computation of the number of iterations for loops whose number of
 908      iterations is only known at runtime.  In both cases, test cases
 909      to get control flow in INSNS appear to be impossible to construct:
 910
 911       * If do_compare_rtx_and_jump needs several branches to do comparison
 912         in a mode that needs comparison by parts, we cannot analyze the
 913         number of iterations of the loop, and we never get to unrolling it.
 914
 915       * The code in expand_divmod that was suspected to cause creation of
 916         branching code seems to be only accessed for signed division.  The
 917         divisions used by # of iterations analysis are always unsigned.
 918         Problems might arise on architectures that emit branching code
 919         for some operations that may appear in the unroller (especially
 920         for division), but we have no such architectures.
 921
 922      Considering all this, it was decided that we should for now assume
 923      that INSNS can in theory contain control flow insns, but in practice
 924      it never does.  So we don't handle the theoretical case, and should
 925      a real failure ever show up, we have a pretty good clue for how to
 926      fix it.  */
 927
 928   return new_bb;
 929 }
 930
 931 /* Unroll LOOP, whose number of iterations can be computed at runtime,
 932    LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this (with some
 933    extra care for case n < 0):
 934
 935    for (i = 0; i < n; i++)
 936      body;
 937
 938    ==>
 939
 940    i = 0;
 941    mod = n % 4;
 942
 943    switch (mod)
 944      {
 945        case 3:
 946          body; i++;
 947        case 2:
 948          body; i++;
 949        case 1:
 950          body; i++;
 951        case 0: ;
 952      }
 953
 954    while (i < n)
 955      {
 956        body; i++;
 957        body; i++;
 958        body; i++;
 959        body; i++;
 960      }
 961    */
 962 static void
 963 unroll_loop_runtime_iterations (struct loop *loop)
 964 {
 965   rtx old_niter, niter, init_code, branch_code, tmp;
 966   unsigned i, j, p;
 967   basic_block preheader, *body, swtch, ezc_swtch;
 968   VEC (basic_block, heap) *dom_bbs;
 969   sbitmap wont_exit;
 970   int may_exit_copy;
 971   unsigned n_peel;
 972   VEC (edge, heap) *remove_edges;
 973   edge e;
 974   bool extra_zero_check, last_may_exit;
 975   unsigned max_unroll = loop->lpt_decision.times;  /* Also the AND mask.  */
 976   struct niter_desc *desc = get_simple_loop_desc (loop);
 977   bool exit_at_end = loop_exit_at_end_p (loop);
 978   struct opt_info *opt_info = NULL;
 979   bool ok;
 980
 981   if (flag_split_ivs_in_unroller
 982       || flag_variable_expansion_in_unroller)
 983     opt_info = analyze_insns_in_loop (loop);
 984
 985   /* Remember blocks outside LOOP whose dominators will have to be updated.  */
 986   dom_bbs = NULL;
 987
 988   body = get_loop_body (loop);
 989   for (i = 0; i < loop->num_nodes; i++)
 990     {
 991       VEC (basic_block, heap) *ldom;
 992       basic_block bb;
 993
 994       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
 995       FOR_EACH_VEC_ELT (basic_block, ldom, j, bb)
 996         if (!flow_bb_inside_loop_p (loop, bb))
 997           VEC_safe_push (basic_block, heap, dom_bbs, bb);
 998
 999       VEC_free (basic_block, heap, ldom);
1000     }
1001   free (body);
1002
1003   if (!exit_at_end)
1004     {
1005       /* Leave exit in first copy (for explanation why see comment in
1006          unroll_loop_constant_iterations).  */
1007       may_exit_copy = 0;
1008       n_peel = max_unroll - 1;
1009       extra_zero_check = true;
1010       last_may_exit = false;
1011     }
1012   else
1013     {
1014       /* Leave exit in last copy (for explanation why see comment in
1015          unroll_loop_constant_iterations).  */
1016       may_exit_copy = max_unroll;
1017       n_peel = max_unroll;
1018       extra_zero_check = false;
1019       last_may_exit = true;
1020     }
1021
1022   /* Emit code computing the number of iterations into a new register.  */
1023   start_sequence ();
1024   old_niter = niter = gen_reg_rtx (desc->mode);
1025   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1026   if (tmp != niter)
1027     emit_move_insn (niter, tmp);
1028
1029   /* Count modulo by ANDing it with max_unroll; we use the fact that
1030      the number of unrollings is a power of two, and thus this is correct
1031      even if there is overflow in the computation.  */
1032   niter = expand_simple_binop (desc->mode, AND,
1033                                niter,
1034                                GEN_INT (max_unroll),
1035                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1036
1037   init_code = get_insns ();
1038   end_sequence ();
1039   unshare_all_rtl_in_chain (init_code);
1040
1041   /* Precondition the loop: put the computation on the preheader edge.  */
1042   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1043
1044   remove_edges = NULL;
1045
1046   wont_exit = sbitmap_alloc (max_unroll + 2);
1047
1048   /* Peel the first copy of loop body (almost always we must leave exit test
1049      here; the only exception is when we have extra zero check and the number
1050      of iterations is reliable).  Also record the place of (possible) extra
1051      zero check.  */
1052   sbitmap_zero (wont_exit);
1053   if (extra_zero_check
1054       && !desc->noloop_assumptions)
1055     SET_BIT (wont_exit, 1);
1056   ezc_swtch = loop_preheader_edge (loop)->src;  /* Used if extra_zero_check.  */
1057   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1058                                       1, wont_exit, desc->out_edge,
1059                                       &remove_edges,
1060                                       DLTHE_FLAG_UPDATE_FREQ);
1061   gcc_assert (ok);
1062
1063   /* Record the place where switch will be built for preconditioning.  */
1064   swtch = split_edge (loop_preheader_edge (loop));
1065
1066   for (i = 0; i < n_peel; i++)
1067     {
1068       /* Peel the copy.  */
1069       sbitmap_zero (wont_exit);
1070       if (i != n_peel - 1 || !last_may_exit)
1071         SET_BIT (wont_exit, 1);
1072       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1073                                           1, wont_exit, desc->out_edge,
1074                                           &remove_edges,
1075                                           DLTHE_FLAG_UPDATE_FREQ);
1076       gcc_assert (ok);
1077
1078       /* Create item for switch.  */
1079       j = n_peel - i - (extra_zero_check ? 0 : 1);  /* NITER value tested.  */
1080       p = REG_BR_PROB_BASE / (i + 2);  /* Probability given to the branch.  */
1081
1082       preheader = split_edge (loop_preheader_edge (loop));
1083       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1084                                           block_label (preheader), p,
1085                                           NULL_RTX);
1086
1087       /* We rely on the fact that the compare and jump cannot be optimized out,
1088          and hence the cfg we create is correct.  */
1089       gcc_assert (branch_code != NULL_RTX);
1090
1091       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1092       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1093       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1094       e = make_edge (swtch, preheader,
1095                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1096       e->probability = p;
1097     }
1098
1099   if (extra_zero_check)
1100     {
1101       /* Add branch for zero iterations.  */
1102       p = REG_BR_PROB_BASE / (max_unroll + 1);
1103       swtch = ezc_swtch;
1104       preheader = split_edge (loop_preheader_edge (loop));
1105       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1106                                           block_label (preheader), p,
1107                                           NULL_RTX);
1108       gcc_assert (branch_code != NULL_RTX);
1109
1110       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1111       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1112       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1113       e = make_edge (swtch, preheader,
1114                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1115       e->probability = p;
1116     }
1117
1118   /* Recount dominators for outer blocks.  */
1119   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1120
1121   /* And unroll loop.  */
1122
1123   sbitmap_ones (wont_exit);
1124   RESET_BIT (wont_exit, may_exit_copy);
1125   opt_info_start_duplication (opt_info);
1126
1127   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1128                                       max_unroll,
1129                                       wont_exit, desc->out_edge,
1130                                       &remove_edges,
1131                                       DLTHE_FLAG_UPDATE_FREQ
1132                                       | (opt_info
1133                                          ? DLTHE_RECORD_COPY_NUMBER
1134                                            : 0));
1135   gcc_assert (ok);
1136
1137   if (opt_info)
1138     {
1139       apply_opt_in_copies (opt_info, max_unroll, true, true);
1140       free_opt_info (opt_info);
1141     }
1142
1143   free (wont_exit);
1144
1145   if (exit_at_end)
1146     {
1147       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1148       /* Find a new in and out edge; they are in the last copy we have
1149          made.  */
1150
1151       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1152         {
1153           desc->out_edge = EDGE_SUCC (exit_block, 0);
1154           desc->in_edge = EDGE_SUCC (exit_block, 1);
1155         }
1156       else
1157         {
1158           desc->out_edge = EDGE_SUCC (exit_block, 1);
1159           desc->in_edge = EDGE_SUCC (exit_block, 0);
1160         }
1161     }
1162
1163   /* Remove the paths starting at the edges recorded in REMOVE_EDGES.  */
1164   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
1165     remove_path (e);
1166   VEC_free (edge, heap, remove_edges);
1167
1168   /* We must be careful when updating the number of iterations due to
1169      preconditioning and the fact that the value must be valid at entry
1170      of the loop.  After passing through the above code, we see that
1171      the correct new number of iterations is this:  */
1172   gcc_assert (!desc->const_iter);
1173   desc->niter_expr =
1174     simplify_gen_binary (UDIV, desc->mode, old_niter,
1175                          GEN_INT (max_unroll + 1));
1176   desc->niter_max /= max_unroll + 1;
1177   if (exit_at_end)
1178     {
1179       desc->niter_expr =
1180         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1181       desc->noloop_assumptions = NULL_RTX;
1182       desc->niter_max--;
1183     }
1184
1185   if (dump_file)
1186     fprintf (dump_file,
1187              ";; Unrolled loop %d times, counting # of iterations "
1188              "in runtime, %i insns\n",
1189              max_unroll, num_loop_insns (loop));
1190
1191   VEC_free (basic_block, heap, dom_bbs);
1192 }
1193
1194 /* Decide whether LOOP should be simply peeled and, if so, how many times.  */
1195 static void
1196 decide_peel_simple (struct loop *loop, int flags)
1197 {
1198   unsigned peel_cnt;
1199   struct niter_desc *info;
1200
1201   if ((flags & UAP_PEEL) == 0)
1202     {
1203       /* Peeling was not requested; leave silently.  */
1204       return;
1205     }
1206
1207   if (dump_file)
1208     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1209
1210   /* peel_cnt = number of iterations to peel.  */
1211   peel_cnt = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1212   if ((unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES) < peel_cnt)
1213     peel_cnt = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1214
1215   /* Skip big loops.  */
1216   if (peel_cnt == 0)
1217     {
1218       if (dump_file)
1219         fprintf (dump_file, ";; Not considering loop, is too big\n");
1220       return;
1221     }
1222
1223   /* Check for simple loops.  */
1224   info = get_simple_loop_desc (loop);
1225
1226   /* Loops with a constant number of iterations are not peeled here.  */
1227   if (info->simple_p && !info->assumptions && info->const_iter)
1228     {
1229       if (dump_file)
1230         fprintf (dump_file, ";; Loop iterates constant times\n");
1231       return;
1232     }
1233
1234   /* Do not simply peel loops with branches inside -- it increases number
1235      of mispredicts.  */
1236   if (num_loop_branches (loop) > 1)
1237     {
1238       if (dump_file)
1239         fprintf (dump_file, ";; Not peeling, contains branches\n");
1240       return;
1241     }
1242
1243   if (loop->header->count == 0)
1244     {
1245       /* For now we have no good heuristics to decide whether loop peeling
1246          will be effective, so disable it.  */
1247       if (dump_file)
1248         fprintf (dump_file,
1249                  ";; Not peeling loop, no evidence it will be profitable\n");
1250       return;
1251     }
1252   else
1253     {
1254       unsigned expected = expected_loop_iterations (loop);
1255       if (peel_cnt < expected + 1)
1256         {
1257           if (dump_file)
1258             {
1259               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1260               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1261                        (HOST_WIDEST_INT) (expected + 1));
1262               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1263                        peel_cnt);
1264             }
1265           return;
1266         }
1267       peel_cnt = expected + 1;
1268     }
1269
1270   /* Success.  */
1271   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1272   loop->lpt_decision.times = peel_cnt;
1273
1274   if (dump_file)
1275     fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
1276              loop->lpt_decision.times);
1277 }
1278
1279 /* Peel LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1280    while (cond)
1281      body;
1282
1283    ==>
1284
1285    if (!cond) goto end;
1286    body;
1287    if (!cond) goto end;
1288    body;
1289    while (cond)
1290      body;
1291    end: ;
1292    */
1293 static void
1294 peel_loop_simple (struct loop *loop)
1295 {
1296   sbitmap no_exit_copies;
1297   unsigned n_copies = loop->lpt_decision.times;
1298   struct niter_desc *info = get_simple_loop_desc (loop);
1299   struct opt_info *opt_info = NULL;
1300   int dup_flags = DLTHE_FLAG_UPDATE_FREQ;
1301   bool ok;
1302
1303   if (flag_split_ivs_in_unroller && n_copies > 1)
1304     opt_info = analyze_insns_in_loop (loop);
1305
1306   /* The bitmap is passed with all of its bits clear.  */
1307   no_exit_copies = sbitmap_alloc (n_copies + 1);
1308   sbitmap_zero (no_exit_copies);
1309
1310   opt_info_start_duplication (opt_info);
1311
1312   if (opt_info)
1313     dup_flags |= DLTHE_RECORD_COPY_NUMBER;
1314   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1315                                       n_copies, no_exit_copies, NULL, NULL,
1316                                       dup_flags);
1317   gcc_assert (ok);
1318
1319   free (no_exit_copies);
1320
1321   if (opt_info)
1322     {
1323       apply_opt_in_copies (opt_info, n_copies, false, false);
1324       free_opt_info (opt_info);
1325     }
1326
1327   if (info->simple_p && info->const_iter)
1328     {
1329       /* A constant iteration count just shrinks by the peeled copies.  */
1330       info->niter -= n_copies;
1331       info->niter_expr = GEN_INT (info->niter);
1332       info->noloop_assumptions = NULL_RTX;
1333     }
1334   else if (info->simple_p)
1335     {
1336       /* We cannot just update niter_expr, as its value might be clobbered
1337          inside loop.  We could handle this by counting the number into
1338          temporary just like we do in runtime unrolling, but it does not
1339          seem worthwhile.  */
1340       free_simple_loop_desc (loop);
1341     }
1342
1343   if (dump_file)
1344     fprintf (dump_file, ";; Peeling loop %d times\n", n_copies);
1345 }
1346
1347 /* Decide whether LOOP should be unrolled stupidly and, if so, how much.  */
1348 static void
1349 decide_unroll_stupid (struct loop *loop, int flags)
1350 {
1351   unsigned nunroll, nunroll_by_av, pow2;
1352   struct niter_desc *info;
1353
1354   if ((flags & UAP_UNROLL_ALL) == 0)
1355     {
1356       /* Unrolling of all loops was not requested; leave silently.  */
1357       return;
1358     }
1359
1360   if (dump_file)
1361     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1362
1363   /* nunroll = total number of copies of the original loop body in the
1364      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
1365   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1366   nunroll_by_av
1367     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1368   if (nunroll_by_av < nunroll)
1369     nunroll = nunroll_by_av;
1370   if ((unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES) < nunroll)
1371     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1372
1373   if (targetm.loop_unroll_adjust)
1374     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1375
1376   /* Skip big loops.  */
1377   if (nunroll < 2)
1378     {
1379       if (dump_file)
1380         fprintf (dump_file, ";; Not considering loop, is too big\n");
1381       return;
1382     }
1383
1384   /* Check for simple loops.  */
1385   info = get_simple_loop_desc (loop);
1386
1387   /* Simple loops without assumptions are not unrolled stupidly.  */
1388   if (info->simple_p && !info->assumptions)
1389     {
1390       if (dump_file)
1391         fprintf (dump_file, ";; The loop is simple\n");
1392       return;
1393     }
1394
1395   /* Do not unroll loops with branches inside -- it increases number
1396      of mispredicts.  */
1397   if (num_loop_branches (loop) > 1)
1398     {
1399       if (dump_file)
1400         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1401       return;
1402     }
1403
1404   /* If we have profile feedback, check whether the loop rolls.  */
1405   if ((loop->header->count
1406        && 2 * nunroll > expected_loop_iterations (loop))
1407       || 2 * nunroll > info->niter_max)
1408     {
1409       if (dump_file)
1410         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1411       return;
1412     }
1413
1414   /* Success.  Now round nunroll down to a power of 2, as it seems that this
1415      improves results (partially because of better alignments, partially
1416      because of some dark magic).  */
1417   pow2 = 1;
1418   while (2 * pow2 <= nunroll)
1419     pow2 *= 2;
1420
1421   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1422   loop->lpt_decision.times = pow2 - 1;
1423
1424   if (dump_file)
1425     fprintf (dump_file, ";; Decided to unroll the loop stupidly, %d times.\n",
1426              loop->lpt_decision.times);
1427 }
1428
1429 /* Unroll LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1430    while (cond)
1431      body;
1432
1433    ==>
1434
1435    while (cond)
1436      {
1437        body;
1438        if (!cond) break;
1439        body;
1440        if (!cond) break;
1441        body;
1442        if (!cond) break;
1443        body;
1444      }
1445    */
1446 static void
1447 unroll_loop_stupid (struct loop *loop)
1448 {
1449   sbitmap no_exit_copies;
1450   unsigned n_copies = loop->lpt_decision.times;
1451   struct niter_desc *info = get_simple_loop_desc (loop);
1452   struct opt_info *opt_info = NULL;
1453   int dup_flags = DLTHE_FLAG_UPDATE_FREQ;
1454   bool ok;
1455
1456   if (flag_split_ivs_in_unroller || flag_variable_expansion_in_unroller)
1457     opt_info = analyze_insns_in_loop (loop);
1458
1459   /* The bitmap is passed with all of its bits clear.  */
1460   no_exit_copies = sbitmap_alloc (n_copies + 1);
1461   sbitmap_zero (no_exit_copies);
1462   opt_info_start_duplication (opt_info);
1463
1464   /* Ask for copy numbers to be recorded only when there is something to
1465      apply to the copies afterwards.  */
1466   if (opt_info)
1467     dup_flags |= DLTHE_RECORD_COPY_NUMBER;
1468   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1469                                       n_copies, no_exit_copies, NULL, NULL,
1470                                       dup_flags);
1471   gcc_assert (ok);
1472
1473   if (opt_info)
1474     {
1475       apply_opt_in_copies (opt_info, n_copies, true, true);
1476       free_opt_info (opt_info);
1477     }
1478
1479   free (no_exit_copies);
1480
1481   if (info->simple_p)
1482     {
1483       /* We indeed may get here provided that there are nontrivial assumptions
1484          for a loop to be really simple.  We could update the counts, but the
1485          problem is that we are unable to decide which exit will be taken
1486          (not really true in case the number of iterations is constant,
1487          but no one will do anything with this information, so we do not
1488          worry about it).  */
1489       info->simple_p = false;
1490     }
1491
1492   if (dump_file)
1493     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1494              n_copies, num_loop_insns (loop));
1495 }
1496
1497 /* Hash IVTS, which is really an "iv_to_split *", by the UID of its insn.  */
1498 static hashval_t
1499 si_info_hash (const void *ivts)
1500 {
1501   rtx insn = ((const struct iv_to_split *) ivts)->insn;
1502   return (hashval_t) INSN_UID (insn);
1503 }
1504
1505 /* Return true if IVTS1 and IVTS2 (which are really both of type
1506    "iv_to_split *") refer to the same instruction.  */
1507
1508 static int
1509 si_info_eq (const void *ivts1, const void *ivts2)
1510 {
1511   rtx insn1 = ((const struct iv_to_split *) ivts1)->insn;
1512   rtx insn2 = ((const struct iv_to_split *) ivts2)->insn;
1513   return insn1 == insn2;
1514 }
1515
1516 /* Hash VES, which is really a "var_to_expand *", by the UID of its insn.  */
1517 static hashval_t
1518 ve_info_hash (const void *ves)
1519 {
1520   rtx insn = ((const struct var_to_expand *) ves)->insn;
1521   return (hashval_t) INSN_UID (insn);
1522 }
1523
1524 /* Equality callback for var_to_expand entries: IVTS1 and IVTS2 (both really
1525    "var_to_expand *") are equal when they refer to the same instruction.  */
1526
1527 static int
1528 ve_info_eq (const void *ivts1, const void *ivts2)
1529 {
1530   rtx insn1 = ((const struct var_to_expand *) ivts1)->insn;
1531   rtx insn2 = ((const struct var_to_expand *) ivts2)->insn;
1532
1533   return insn1 == insn2;
1534 }
1535
1536 /* Returns true if REG is referenced in exactly one nondebug insn in LOOP.
1537    *DEBUG_USES is incremented for each debug insn found that references
1538    REG, so the caller has to initialize it before the call.  */
1539
1540 bool
1541 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1542                                   int *debug_uses)
1543 {
1544   basic_block *body, bb;
1545   unsigned i;
1546   int count_ref = 0;  /* Nondebug insns referencing REG found so far.  */
1547   rtx insn;
1548
1549   body = get_loop_body (loop);
1550   for (i = 0; i < loop->num_nodes; i++)
1551     {
1552       bb = body[i];
1553
1554       FOR_BB_INSNS (bb, insn)
1555         if (!rtx_referenced_p (reg, insn))
1556           continue;
1557         else if (DEBUG_INSN_P (insn))
1558           ++*debug_uses;
1559         else if (++count_ref > 1)
1560           break;
1561     }
1562   free (body);
1563   return (count_ref  == 1);
1564 }
1565
1566 /* Reset up to DEBUG_USES debug insns in LOOP that reference REG.  */
1567
1568 static void
1569 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1570 {
1571   basic_block *blocks, bb;
1572   unsigned idx;
1573   rtx insn;
1574
1575   blocks = get_loop_body (loop);
1576   for (idx = 0; debug_uses != 0 && idx < loop->num_nodes; idx++)
1577     {
1578       bb = blocks[idx];
1579
1580       FOR_BB_INSNS (bb, insn)
1581         if (DEBUG_INSN_P (insn) && rtx_referenced_p (reg, insn))
1582           {
1583             /* Replace the variable location of this debug insn with an
1584                unknown-location marker.  */
1585             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1586                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1587             if (--debug_uses == 0)
1588               break;
1589           }
1590     }
1591   free (blocks);
1592 }
1593
1594 /* Determine whether INSN contains an accumulator
1595    which can be expanded into separate copies,
1596    one for each copy of the LOOP body.
1597
1598    for (i = 0 ; i < n; i++)
1599      sum += a[i];
1600
1601    ==>
1602
1603    sum += a[i]
1604    ....
1605    i = i+1;
1606    sum1 += a[i]
1607    ....
1608    i = i+1
1609    sum2 += a[i];
1610    ....
1611
1612    Return NULL if INSN contains no opportunity for expansion of accumulator.
1613    Otherwise, reset the debug insns in LOOP that reference the accumulator,
1614    allocate and fill a VAR_TO_EXPAND structure, and return a pointer to it.
1615 */
1616
1617 static struct var_to_expand *
1618 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1619 {
1620   rtx set, dest, src;
1621   struct var_to_expand *ves;
1622   unsigned accum_pos;  /* Operand index of the accumulator within SRC.  */
1623   enum rtx_code code;
1624   int debug_uses = 0;
1625
1626   set = single_set (insn);
1627   if (!set)
1628     return NULL;
1629
1630   dest = SET_DEST (set);
1631   src = SET_SRC (set);
1632   code = GET_CODE (src);
1633
1634   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1635     return NULL;
1636
1637   if (FLOAT_MODE_P (GET_MODE (dest)))
1638     {
1639       if (!flag_associative_math)
1640         return NULL;
1641       /* In the case of FMA, we're also changing the rounding.  */
1642       if (code == FMA && !flag_unsafe_math_optimizations)
1643         return NULL;
1644     }
1645
1646   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1647      in MD.  But if there is no optab to generate the insn, we can not
1648      perform the variable expansion.  This can happen if an MD provides
1649      an insn but not a named pattern to generate it, for example to avoid
1650      producing code that needs additional mode switches like for x87/mmx.
1651
1652      So we check have_insn_for which looks for an optab for the operation
1653      in SRC.  If it doesn't exist, we can't perform the expansion even
1654      though INSN is valid.  */
1655   if (!have_insn_for (code, GET_MODE (src)))
1656     return NULL;
1657
1658   if (!REG_P (dest)
1659       && !(GET_CODE (dest) == SUBREG
1660            && REG_P (SUBREG_REG (dest))))
1661     return NULL;
1662
1663   /* Find the accumulator use within the operation.  */
1664   if (code == FMA)
1665     {
1666       /* We only support accumulation via FMA in the ADD position.  */
1667       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1668         return NULL;
1669       accum_pos = 2;
1670     }
1671   else if (rtx_equal_p (dest, XEXP (src, 0)))
1672     accum_pos = 0;
1673   else if (rtx_equal_p (dest, XEXP (src, 1)))
1674     {
1675       /* The method of expansion that we are using; which includes the
1676          initialization of the expansions with zero and the summation of
1677          the expansions at the end of the computation will yield wrong
1678          results for (x = something - x) thus avoid using it in that case.  */
1679       if (code == MINUS)
1680         return NULL;
1681       accum_pos = 1;
1682     }
1683   else
1684     return NULL;
1685
1686   /* It must not otherwise be used within the same operation.  */
1687   if (code == FMA)
1688     {
1689       if (rtx_referenced_p (dest, XEXP (src, 0))
1690           || rtx_referenced_p (dest, XEXP (src, 1)))
1691         return NULL;
1692     }
1693   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1694     return NULL;
1695
1696   /* It must be used in exactly one nondebug insn of the loop.  */
1697   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1698     return NULL;
1699
1700   if (dump_file)
1701     {
1702       fprintf (dump_file, "\n;; Expanding Accumulator ");
1703       print_rtl (dump_file, dest);
1704       fprintf (dump_file, "\n");
1705     }
1706
1707   if (debug_uses)
1708     /* Instead of resetting the debug insns, we could replace each
1709        debug use in the loop with the sum or product of all expanded
1710        accumulators.  Since we'll only know of all expansions at the
1711        end, we'd have to keep track of which vars_to_expand a debug
1712        insn in the loop references, take note of each copy of the
1713        debug insn during unrolling, and when it's all done, compute
1714        the sum or product of each variable and adjust the original
1715        debug insn and each copy thereof.  What a pain!  */
1716     reset_debug_uses_in_loop (loop, dest, debug_uses);
1717
1718   /* Record the accumulator to expand.  */
1719   ves = XNEW (struct var_to_expand);
1720   ves->insn = insn;
1721   ves->reg = copy_rtx (dest);
1722   ves->var_expansions = VEC_alloc (rtx, heap, 1);
1723   ves->next = NULL;
1724   ves->op = GET_CODE (src);
1725   ves->expansion_count = 0;
1726   ves->reuse_expansion = 0;
1727   ves->accum_pos = accum_pos;
1728   return ves;
1729 }
1730
1731 /* Determine whether INSN sets an induction variable that we would like
1732    to split during unrolling.
1733
1734    I.e. replace
1735
1736    i = i + 1;
1737    ...
1738    i = i + 1;
1739    ...
1740    i = i + 1;
1741    ...
1742
1743    type chains by
1744
1745    i0 = i + 1
1746    ...
1747    i = i0 + 1
1748    ...
1749    i = i0 + 2
1750    ...
1751
1752    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1753    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1754    pointer to it.  */
1755
1756 static struct iv_to_split *
1757 analyze_iv_to_split_insn (rtx insn)
1758 {
1759   rtx pat, reg;
1760   struct rtx_iv iv;
1761   struct iv_to_split *entry;
1762
1763   /* For now we just split the basic induction variables.  Later this may be
1764      extended for example by selecting also addresses of memory references.  */
1765   pat = single_set (insn);
1766   if (pat == NULL_RTX)
1767     return NULL;
1768
1769   /* The destination must be a register, and biv_p must accept it for
1770      INSN.  */
1771   reg = SET_DEST (pat);
1772   if (!REG_P (reg) || !biv_p (insn, reg))
1773     return NULL;
1774
1775   /* The result of iv_analyze_result used to be asserted under the
1776      assumption that if biv_p returns true then iv_analyze_result must
1777      also return true.  However, that assumption is not strictly correct
1778      as evidenced by pr25569.
1779
1780      Returning NULL when iv_analyze_result returns false is safe and
1781      avoids the problems in pr25569 until the iv_analyze_* routines
1782      can be fixed, which is apparently hard and time consuming
1783      according to their author.  */
1784   if (!iv_analyze_result (insn, reg, &iv))
1785     return NULL;
1786
1787   /* Give up when the step is zero or when the mode of the induction
1788      variable differs from its extend mode.  */
1789   if (iv.step == const0_rtx || iv.mode != iv.extend_mode)
1790     return NULL;
1791
1792   /* Record the insn to split.  The base variable is left unset here, and
1793      exactly one location is recorded (N_LOC is 1, with LOC[0] set to
1794      1).  */
1795   entry = XNEW (struct iv_to_split);
1796   entry->insn = insn;
1797   entry->base_var = NULL_RTX;
1798   entry->step = iv.step;
1799   entry->next = NULL;
1800   entry->n_loc = 1;
1801   entry->loc[0] = 1;
1802
1803   return entry;
1804 }
1805
1806 /* Determines which of insns in LOOP can be optimized.
1807    Return a OPT_INFO struct with the relevant hash tables filled
1808    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1809    is undefined for the return value.  */
1810
1811 static struct opt_info *
1812 analyze_insns_in_loop (struct loop *loop)
1813 {
1814   basic_block *body, bb;
1815   unsigned i;
1816   struct opt_info *opt_info = XCNEW (struct opt_info);
1817   rtx insn;
1818   struct iv_to_split *ivts = NULL;
1819   struct var_to_expand *ves = NULL;
1820   PTR *slot1;
1821   PTR *slot2;
1822   VEC (edge, heap) *edges = get_loop_exit_edges (loop);
1823   edge exit;
1824   bool can_apply = false;
1825
1826   iv_analysis_loop_init (loop);
1827
1828   body = get_loop_body (loop);
1829
1830   if (flag_split_ivs_in_unroller)
1831     {
1832       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1833                                               si_info_hash, si_info_eq, free);
1834       opt_info->iv_to_split_head = NULL;
1835       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1836     }
1837
1838   /* Record the loop exit bb and loop preheader before the unrolling.  */
1839   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1840
1841   if (VEC_length (edge, edges) == 1)
1842     {
1843       exit = VEC_index (edge, edges, 0);
1844       if (!(exit->flags & EDGE_COMPLEX))
1845         {
1846           opt_info->loop_exit = split_edge (exit);
1847           can_apply = true;
1848         }
1849     }
1850
1851   if (flag_variable_expansion_in_unroller
1852       && can_apply)
1853     {
1854       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1855                                                         ve_info_hash,
1856                                                         ve_info_eq, free);
1857       opt_info->var_to_expand_head = NULL;
1858       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1859     }
1860
1861   for (i = 0; i < loop->num_nodes; i++)
1862     {
1863       bb = body[i];
1864       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1865         continue;
1866
1867       FOR_BB_INSNS (bb, insn)
1868       {
1869         if (!INSN_P (insn))
1870           continue;
1871
1872         if (opt_info->insns_to_split)
1873           ivts = analyze_iv_to_split_insn (insn);
1874
1875         if (ivts)
1876           {
1877             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1878             gcc_assert (*slot1 == NULL);
1879             *slot1 = ivts;
1880             *opt_info->iv_to_split_tail = ivts;
1881             opt_info->iv_to_split_tail = &ivts->next;
1882             continue;
1883           }
1884
1885         if (opt_info->insns_with_var_to_expand)
1886           ves = analyze_insn_to_expand_var (loop, insn);
1887
1888         if (ves)
1889           {
1890             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1891             gcc_assert (*slot2 == NULL);
1892             *slot2 = ves;
1893             *opt_info->var_to_expand_tail = ves;
1894             opt_info->var_to_expand_tail = &ves->next;
1895           }
1896       }
1897     }
1898
1899   VEC_free (edge, heap, edges);
1900   free (body);
1901   return opt_info;
1902 }
1903
1904 /* Called just before loop duplication.  Records start of duplicated area
1905    to OPT_INFO.  */
1906
1907 static void
1908 opt_info_start_duplication (struct opt_info *opt_info)
1909 {
1910   if (opt_info)
1911     opt_info->first_new_block = last_basic_block;
1912 }
1913
/* Return how many iterations separate the initialization of the base
   variable from the copy numbered N_COPY.  N_COPIES is the total number of
   newly created copies.  UNROLLING is true if we are unrolling (not
   peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the base variable is initialized in the original loop
     body (number 0), so the distance is simply the copy number.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization happens in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
1938
1939 /* Locate in EXPR the expression corresponding to the location recorded
1940    in IVTS, and return a pointer to the RTX for this location.  */
1941
1942 static rtx *
1943 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1944 {
1945   unsigned i;
1946   rtx *ret = &expr;
1947
1948   for (i = 0; i < ivts->n_loc; i++)
1949     ret = &XEXP (*ret, ivts->loc[i]);
1950
1951   return ret;
1952 }
1953
1954 /* Allocate basic variable for the induction variable chain.  */
1955
1956 static void
1957 allocate_basic_variable (struct iv_to_split *ivts)
1958 {
1959   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
1960
1961   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1962 }
1963
1964 /* Insert initialization of basic variable of IVTS before INSN, taking
1965    the initial value from INSN.  */
1966
1967 static void
1968 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
1969 {
1970   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
1971   rtx seq;
1972
1973   start_sequence ();
1974   expr = force_operand (expr, ivts->base_var);
1975   if (expr != ivts->base_var)
1976     emit_move_insn (ivts->base_var, expr);
1977   seq = get_insns ();
1978   end_sequence ();
1979
1980   emit_insn_before (seq, insn);
1981 }
1982
1983 /* Replace the use of induction variable described in IVTS in INSN
1984    by base variable + DELTA * step.  */
1985
1986 static void
1987 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
1988 {
1989   rtx expr, *loc, seq, incr, var;
1990   enum machine_mode mode = GET_MODE (ivts->base_var);
1991   rtx src, dest, set;
1992
1993   /* Construct base + DELTA * step.  */
1994   if (!delta)
1995     expr = ivts->base_var;
1996   else
1997     {
1998       incr = simplify_gen_binary (MULT, mode,
1999                                   ivts->step, gen_int_mode (delta, mode));
2000       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2001                                   ivts->base_var, incr);
2002     }
2003
2004   /* Figure out where to do the replacement.  */
2005   loc = get_ivts_expr (single_set (insn), ivts);
2006
2007   /* If we can make the replacement right away, we're done.  */
2008   if (validate_change (insn, loc, expr, 0))
2009     return;
2010
2011   /* Otherwise, force EXPR into a register and try again.  */
2012   start_sequence ();
2013   var = gen_reg_rtx (mode);
2014   expr = force_operand (expr, var);
2015   if (expr != var)
2016     emit_move_insn (var, expr);
2017   seq = get_insns ();
2018   end_sequence ();
2019   emit_insn_before (seq, insn);
2020
2021   if (validate_change (insn, loc, var, 0))
2022     return;
2023
2024   /* The last chance.  Try recreating the assignment in insn
2025      completely from scratch.  */
2026   set = single_set (insn);
2027   gcc_assert (set);
2028
2029   start_sequence ();
2030   *loc = var;
2031   src = copy_rtx (SET_SRC (set));
2032   dest = copy_rtx (SET_DEST (set));
2033   src = force_operand (src, dest);
2034   if (src != dest)
2035     emit_move_insn (dest, src);
2036   seq = get_insns ();
2037   end_sequence ();
2038
2039   emit_insn_before (seq, insn);
2040   delete_insn (insn);
2041 }
2042
2043
2044 /* Return one expansion of the accumulator recorded in struct VE.  */
2045
2046 static rtx
2047 get_expansion (struct var_to_expand *ve)
2048 {
2049   rtx reg;
2050
2051   if (ve->reuse_expansion == 0)
2052     reg = ve->reg;
2053   else
2054     reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);
2055
2056   if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
2057     ve->reuse_expansion = 0;
2058   else
2059     ve->reuse_expansion++;
2060
2061   return reg;
2062 }
2063
2064
2065 /* Given INSN replace the uses of the accumulator recorded in VE
2066    with a new register.  */
2067
2068 static void
2069 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2070 {
2071   rtx new_reg, set;
2072   bool really_new_expansion = false;
2073
2074   set = single_set (insn);
2075   gcc_assert (set);
2076
2077   /* Generate a new register only if the expansion limit has not been
2078      reached.  Else reuse an already existing expansion.  */
2079   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2080     {
2081       really_new_expansion = true;
2082       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2083     }
2084   else
2085     new_reg = get_expansion (ve);
2086
2087   validate_change (insn, &SET_DEST (set), new_reg, 1);
2088   validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);
2089
2090   if (apply_change_group ())
2091     if (really_new_expansion)
2092       {
2093         VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
2094         ve->expansion_count++;
2095       }
2096 }
2097
2098 /* Initialize the variable expansions in loop preheader.  PLACE is the
2099    loop-preheader basic block where the initialization of the
2100    expansions should take place.  The expansions are initialized with
2101    (-0) when the operation is plus or minus to honor sign zero.  This
2102    way we can prevent cases where the sign of the final result is
2103    effected by the sign of the expansion.  Here is an example to
2104    demonstrate this:
2105
2106    for (i = 0 ; i < n; i++)
2107      sum += something;
2108
2109    ==>
2110
2111    sum += something
2112    ....
2113    i = i+1;
2114    sum1 += something
2115    ....
2116    i = i+1
2117    sum2 += something;
2118    ....
2119
2120    When SUM is initialized with -zero and SOMETHING is also -zero; the
2121    final result of sum should be -zero thus the expansions sum1 and sum2
2122    should be initialized with -zero as well (otherwise we will get +zero
2123    as the final result).  */
2124
2125 static void
2126 insert_var_expansion_initialization (struct var_to_expand *ve,
2127                                      basic_block place)
2128 {
2129   rtx seq, var, zero_init, insn;
2130   unsigned i;
2131   enum machine_mode mode = GET_MODE (ve->reg);
2132   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2133
2134   if (VEC_length (rtx, ve->var_expansions) == 0)
2135     return;
2136
2137   start_sequence ();
2138   switch (ve->op)
2139     {
2140     case FMA:
2141       /* Note that we only accumulate FMA via the ADD operand.  */
2142     case PLUS:
2143     case MINUS:
2144       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2145         {
2146           if (honor_signed_zero_p)
2147             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2148           else
2149             zero_init = CONST0_RTX (mode);
2150           emit_move_insn (var, zero_init);
2151         }
2152       break;
2153
2154     case MULT:
2155       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2156         {
2157           zero_init = CONST1_RTX (GET_MODE (var));
2158           emit_move_insn (var, zero_init);
2159         }
2160       break;
2161
2162     default:
2163       gcc_unreachable ();
2164     }
2165
2166   seq = get_insns ();
2167   end_sequence ();
2168
2169   insn = BB_HEAD (place);
2170   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2171     insn = NEXT_INSN (insn);
2172
2173   emit_insn_after (seq, insn);
2174 }
2175
2176 /* Combine the variable expansions at the loop exit.  PLACE is the
2177    loop exit basic block where the summation of the expansions should
2178    take place.  */
2179
2180 static void
2181 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2182 {
2183   rtx sum = ve->reg;
2184   rtx expr, seq, var, insn;
2185   unsigned i;
2186
2187   if (VEC_length (rtx, ve->var_expansions) == 0)
2188     return;
2189
2190   start_sequence ();
2191   switch (ve->op)
2192     {
2193     case FMA:
2194       /* Note that we only accumulate FMA via the ADD operand.  */
2195     case PLUS:
2196     case MINUS:
2197       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2198         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2199       break;
2200
2201     case MULT:
2202       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2203         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2204       break;
2205
2206     default:
2207       gcc_unreachable ();
2208     }
2209
2210   expr = force_operand (sum, ve->reg);
2211   if (expr != ve->reg)
2212     emit_move_insn (ve->reg, expr);
2213   seq = get_insns ();
2214   end_sequence ();
2215
2216   insn = BB_HEAD (place);
2217   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2218     insn = NEXT_INSN (insn);
2219
2220   emit_insn_after (seq, insn);
2221 }
2222
2223 /* Apply loop optimizations in loop copies using the
2224    data which gathered during the unrolling.  Structure
2225    OPT_INFO record that data.
2226
2227    UNROLLING is true if we unrolled (not peeled) the loop.
2228    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2229    the loop (as it should happen in complete unrolling, but not in ordinary
2230    peeling of the loop).  */
2231
2232 static void
2233 apply_opt_in_copies (struct opt_info *opt_info,
2234                      unsigned n_copies, bool unrolling,
2235                      bool rewrite_original_loop)
2236 {
2237   unsigned i, delta;
2238   basic_block bb, orig_bb;
2239   rtx insn, orig_insn, next;
2240   struct iv_to_split ivts_templ, *ivts;
2241   struct var_to_expand ve_templ, *ves;
2242
2243   /* Sanity check -- we need to put initialization in the original loop
2244      body.  */
2245   gcc_assert (!unrolling || rewrite_original_loop);
2246
2247   /* Allocate the basic variables (i0).  */
2248   if (opt_info->insns_to_split)
2249     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2250       allocate_basic_variable (ivts);
2251
2252   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2253     {
2254       bb = BASIC_BLOCK (i);
2255       orig_bb = get_bb_original (bb);
2256
2257       /* bb->aux holds position in copy sequence initialized by
2258          duplicate_loop_to_header_edge.  */
2259       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2260                                         unrolling);
2261       bb->aux = 0;
2262       orig_insn = BB_HEAD (orig_bb);
2263       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2264         {
2265           next = NEXT_INSN (insn);
2266           if (!INSN_P (insn)
2267               || (DEBUG_INSN_P (insn)
2268                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2269             continue;
2270
2271           while (!INSN_P (orig_insn)
2272                  || (DEBUG_INSN_P (orig_insn)
2273                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2274                          == LABEL_DECL)))
2275             orig_insn = NEXT_INSN (orig_insn);
2276
2277           ivts_templ.insn = orig_insn;
2278           ve_templ.insn = orig_insn;
2279
2280           /* Apply splitting iv optimization.  */
2281           if (opt_info->insns_to_split)
2282             {
2283               ivts = (struct iv_to_split *)
2284                 htab_find (opt_info->insns_to_split, &ivts_templ);
2285
2286               if (ivts)
2287                 {
2288                   gcc_assert (GET_CODE (PATTERN (insn))
2289                               == GET_CODE (PATTERN (orig_insn)));
2290
2291                   if (!delta)
2292                     insert_base_initialization (ivts, insn);
2293                   split_iv (ivts, insn, delta);
2294                 }
2295             }
2296           /* Apply variable expansion optimization.  */
2297           if (unrolling && opt_info->insns_with_var_to_expand)
2298             {
2299               ves = (struct var_to_expand *)
2300                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2301               if (ves)
2302                 {
2303                   gcc_assert (GET_CODE (PATTERN (insn))
2304                               == GET_CODE (PATTERN (orig_insn)));
2305                   expand_var_during_unrolling (ves, insn);
2306                 }
2307             }
2308           orig_insn = NEXT_INSN (orig_insn);
2309         }
2310     }
2311
2312   if (!rewrite_original_loop)
2313     return;
2314
2315   /* Initialize the variable expansions in the loop preheader
2316      and take care of combining them at the loop exit.  */
2317   if (opt_info->insns_with_var_to_expand)
2318     {
2319       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2320         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2321       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2322         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2323     }
2324
2325   /* Rewrite also the original loop body.  Find them as originals of the blocks
2326      in the last copied iteration, i.e. those that have
2327      get_bb_copy (get_bb_original (bb)) == bb.  */
2328   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2329     {
2330       bb = BASIC_BLOCK (i);
2331       orig_bb = get_bb_original (bb);
2332       if (get_bb_copy (orig_bb) != bb)
2333         continue;
2334
2335       delta = determine_split_iv_delta (0, n_copies, unrolling);
2336       for (orig_insn = BB_HEAD (orig_bb);
2337            orig_insn != NEXT_INSN (BB_END (bb));
2338            orig_insn = next)
2339         {
2340           next = NEXT_INSN (orig_insn);
2341
2342           if (!INSN_P (orig_insn))
2343             continue;
2344
2345           ivts_templ.insn = orig_insn;
2346           if (opt_info->insns_to_split)
2347             {
2348               ivts = (struct iv_to_split *)
2349                 htab_find (opt_info->insns_to_split, &ivts_templ);
2350               if (ivts)
2351                 {
2352                   if (!delta)
2353                     insert_base_initialization (ivts, orig_insn);
2354                   split_iv (ivts, orig_insn, delta);
2355                   continue;
2356                 }
2357             }
2358
2359         }
2360     }
2361 }
2362
2363 /* Release OPT_INFO.  */
2364
2365 static void
2366 free_opt_info (struct opt_info *opt_info)
2367 {
2368   if (opt_info->insns_to_split)
2369     htab_delete (opt_info->insns_to_split);
2370   if (opt_info->insns_with_var_to_expand)
2371     {
2372       struct var_to_expand *ves;
2373
2374       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2375         VEC_free (rtx, heap, ves->var_expansions);
2376       htab_delete (opt_info->insns_with_var_to_expand);
2377     }
2378   free (opt_info);
2379 }