gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "hard-reg-set.h"
  26 #include "obstack.h"
  27 #include "basic-block.h"
  28 #include "cfgloop.h"
  29 #include "params.h"
  30 #include "expr.h"
  31 #include "hash-table.h"
  32 #include "recog.h"
  33 #include "target.h"
  34 #include "dumpfile.h"
  35
  36 /* This pass performs loop unrolling and peeling.  We only perform these
  37    optimizations on innermost loops (with single exception) because
  38    the impact on performance is greatest here, and we want to avoid
  39    unnecessary code size growth.  The gain is caused by greater sequentiality
  40    of code, better code to optimize for further passes and in some cases
  41    by fewer testings of exit conditions.  The main problem is code growth,
  42    that impacts performance negatively due to effect of caches.
  43
  44    What we do:
  45
  46    -- complete peeling of once-rolling loops; this is the above mentioned
  47       exception, as this causes loop to be cancelled completely and
  48       does not cause code growth
  49    -- complete peeling of loops that roll (small) constant times.
  50    -- simple peeling of first iterations of loops that do not roll much
  51       (according to profile feedback)
  52    -- unrolling of loops that roll constant times; this is almost always
  53       win, as we get rid of exit condition tests.
  54    -- unrolling of loops that roll number of times that we can compute
  55       in runtime; we also get rid of exit condition tests here, but there
  56       is the extra expense for calculating the number of iterations
  57    -- simple unrolling of remaining loops; this is performed only if we
  58       are asked to, as the gain is questionable in this case and often
  59       it may even slow down the code
  60    For more detailed descriptions of each of those, see comments at
  61    appropriate function below.
  62
   There are many parameters (defined and described in params.def) that
   control how much we unroll/peel.
  65
   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of several %.
  69    */
  70
  71 /* Information about induction variables to split.  */
  72
  73 struct iv_to_split
  74 {
  75   rtx insn;             /* The insn in that the induction variable occurs.  */
  76   rtx orig_var;         /* The variable (register) for the IV before split.  */
  77   rtx base_var;         /* The variable on that the values in the further
  78                            iterations are based.  */
  79   rtx step;             /* Step of the induction variable.  */
  80   struct iv_to_split *next; /* Next entry in walking order.  */
  81   unsigned n_loc;
  82   unsigned loc[3];      /* Location where the definition of the induction
  83                            variable occurs in the insn.  For example if
  84                            N_LOC is 2, the expression is located at
  85                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  86 };
  87
  88 /* Information about accumulators to expand.  */
  89
  90 struct var_to_expand
  91 {
  92   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  93   rtx reg;                         /* The accumulator which is expanded.  */
  94   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  95   struct var_to_expand *next;      /* Next entry in walking order.  */
  96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  97                                       or multiplication.  */
  98   int expansion_count;             /* Count the number of expansions generated so far.  */
  99   int reuse_expansion;             /* The expansion we intend to reuse to expand
 100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 101                                       the original accumulator.  Else use
 102                                       var_expansions[REUSE_EXPANSION - 1].  */
 103 };
 104
 105 /* Hashtable helper for iv_to_split.  */
 106
 107 struct iv_split_hasher : typed_free_remove <iv_to_split>
 108 {
 109   typedef iv_to_split value_type;
 110   typedef iv_to_split compare_type;
 111   static inline hashval_t hash (const value_type *);
 112   static inline bool equal (const value_type *, const compare_type *);
 113 };
 114
 115
 116 /* A hash function for information about insns to split.  */
 117
 118 inline hashval_t
 119 iv_split_hasher::hash (const value_type *ivts)
 120 {
 121   return (hashval_t) INSN_UID (ivts->insn);
 122 }
 123
 124 /* An equality functions for information about insns to split.  */
 125
 126 inline bool
 127 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
 128 {
 129   return i1->insn == i2->insn;
 130 }
 131
 132 /* Hashtable helper for iv_to_split.  */
 133
 134 struct var_expand_hasher : typed_free_remove <var_to_expand>
 135 {
 136   typedef var_to_expand value_type;
 137   typedef var_to_expand compare_type;
 138   static inline hashval_t hash (const value_type *);
 139   static inline bool equal (const value_type *, const compare_type *);
 140 };
 141
 142 /* Return a hash for VES.  */
 143
 144 inline hashval_t
 145 var_expand_hasher::hash (const value_type *ves)
 146 {
 147   return (hashval_t) INSN_UID (ves->insn);
 148 }
 149
 150 /* Return true if I1 and I2 refer to the same instruction.  */
 151
 152 inline bool
 153 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
 154 {
 155   return i1->insn == i2->insn;
 156 }
 157
 158 /* Information about optimization applied in
 159    the unrolled loop.  */
 160
 161 struct opt_info
 162 {
 163   hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
 164                                                   split.  */
 165   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 166   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 167   hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
 168                                         insns with accumulators to expand.  */
 169   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 170   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 171   unsigned first_new_block;        /* The first basic block that was
 172                                       duplicated.  */
 173   basic_block loop_exit;           /* The loop exit basic block.  */
 174   basic_block loop_preheader;      /* The loop preheader basic block.  */
 175 };
 176
 177 static void decide_unrolling_and_peeling (int);
 178 static void peel_loops_completely (int);
 179 static void decide_peel_simple (struct loop *, int);
 180 static void decide_peel_once_rolling (struct loop *, int);
 181 static void decide_peel_completely (struct loop *, int);
 182 static void decide_unroll_stupid (struct loop *, int);
 183 static void decide_unroll_constant_iterations (struct loop *, int);
 184 static void decide_unroll_runtime_iterations (struct loop *, int);
 185 static void peel_loop_simple (struct loop *);
 186 static void peel_loop_completely (struct loop *);
 187 static void unroll_loop_stupid (struct loop *);
 188 static void unroll_loop_constant_iterations (struct loop *);
 189 static void unroll_loop_runtime_iterations (struct loop *);
 190 static struct opt_info *analyze_insns_in_loop (struct loop *);
 191 static void opt_info_start_duplication (struct opt_info *);
 192 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 193 static void free_opt_info (struct opt_info *);
 194 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 195 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 196 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 197 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 198 static void insert_var_expansion_initialization (struct var_to_expand *,
 199                                                  basic_block);
 200 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 201                                              basic_block);
 202 static rtx get_expansion (struct var_to_expand *);
 203
 204 /* Emit a message summarizing the unroll or peel that will be
 205    performed for LOOP, along with the loop's location LOCUS, if
 206    appropriate given the dump or -fopt-info settings.  */
 207
 208 static void
 209 report_unroll_peel (struct loop *loop, location_t locus)
 210 {
 211   struct niter_desc *desc;
 212   int niters = 0;
 213   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 214
 215   if (!dump_enabled_p ())
 216     return;
 217
 218   /* In the special case where the loop never iterated, emit
 219      a different message so that we don't report an unroll by 0.
 220      This matches the equivalent message emitted during tree unrolling.  */
 221   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 222       && !loop->lpt_decision.times)
 223     {
 224       dump_printf_loc (report_flags, locus,
 225                        "Turned loop into non-loop; it never loops.\n");
 226       return;
 227     }
 228
 229   desc = get_simple_loop_desc (loop);
 230
 231   if (desc->const_iter)
 232     niters = desc->niter;
 233   else if (loop->header->count)
 234     niters = expected_loop_iterations (loop);
 235
 236   dump_printf_loc (report_flags, locus,
 237                    "%s loop %d times",
 238                    (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 239                     ?  "Completely unroll"
 240                     : (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 241                        ? "Peel" : "Unroll")),
 242                    loop->lpt_decision.times);
 243   if (profile_info)
 244     dump_printf (report_flags,
 245                  " (header execution count %d",
 246                  (int)loop->header->count);
 247   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 248     dump_printf (report_flags,
 249                  "%s%s iterations %d)",
 250                  profile_info ? ", " : " (",
 251                  desc->const_iter ? "const" : "average",
 252                  niters);
 253   else if (profile_info)
 254     dump_printf (report_flags, ")");
 255
 256   dump_printf (report_flags, "\n");
 257 }
 258
 259 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 260 void
 261 unroll_and_peel_loops (int flags)
 262 {
 263   struct loop *loop;
 264   bool changed = false;
 265   loop_iterator li;
 266
 267   /* First perform complete loop peeling (it is almost surely a win,
 268      and affects parameters for further decision a lot).  */
 269   peel_loops_completely (flags);
 270
 271   /* Now decide rest of unrolling and peeling.  */
 272   decide_unrolling_and_peeling (flags);
 273
 274   /* Scan the loops, inner ones first.  */
 275   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 276     {
 277       /* And perform the appropriate transformations.  */
 278       switch (loop->lpt_decision.decision)
 279         {
 280         case LPT_PEEL_COMPLETELY:
 281           /* Already done.  */
 282           gcc_unreachable ();
 283         case LPT_PEEL_SIMPLE:
 284           peel_loop_simple (loop);
 285           changed = true;
 286           break;
 287         case LPT_UNROLL_CONSTANT:
 288           unroll_loop_constant_iterations (loop);
 289           changed = true;
 290           break;
 291         case LPT_UNROLL_RUNTIME:
 292           unroll_loop_runtime_iterations (loop);
 293           changed = true;
 294           break;
 295         case LPT_UNROLL_STUPID:
 296           unroll_loop_stupid (loop);
 297           changed = true;
 298           break;
 299         case LPT_NONE:
 300           break;
 301         default:
 302           gcc_unreachable ();
 303         }
 304     }
 305
 306     if (changed)
 307       {
 308         calculate_dominance_info (CDI_DOMINATORS);
 309         fix_loop_structure (NULL);
 310       }
 311
 312   iv_analysis_done ();
 313 }
 314
 315 /* Check whether exit of the LOOP is at the end of loop body.  */
 316
 317 static bool
 318 loop_exit_at_end_p (struct loop *loop)
 319 {
 320   struct niter_desc *desc = get_simple_loop_desc (loop);
 321   rtx insn;
 322
 323   if (desc->in_edge->dest != loop->latch)
 324     return false;
 325
 326   /* Check that the latch is empty.  */
 327   FOR_BB_INSNS (loop->latch, insn)
 328     {
 329       if (NONDEBUG_INSN_P (insn))
 330         return false;
 331     }
 332
 333   return true;
 334 }
 335
 336 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 337 static void
 338 peel_loops_completely (int flags)
 339 {
 340   struct loop *loop;
 341   loop_iterator li;
 342   bool changed = false;
 343
 344   /* Scan the loops, the inner ones first.  */
 345   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 346     {
 347       loop->lpt_decision.decision = LPT_NONE;
 348       location_t locus = get_loop_location (loop);
 349
 350       if (dump_enabled_p ())
 351         dump_printf_loc (TDF_RTL, locus,
 352                          ";; *** Considering loop %d at BB %d for "
 353                          "complete peeling ***\n",
 354                          loop->num, loop->header->index);
 355
 356       loop->ninsns = num_loop_insns (loop);
 357
 358       decide_peel_once_rolling (loop, flags);
 359       if (loop->lpt_decision.decision == LPT_NONE)
 360         decide_peel_completely (loop, flags);
 361
 362       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 363         {
 364           report_unroll_peel (loop, locus);
 365           peel_loop_completely (loop);
 366           changed = true;
 367         }
 368     }
 369
 370     if (changed)
 371       {
 372         calculate_dominance_info (CDI_DOMINATORS);
 373         fix_loop_structure (NULL);
 374       }
 375 }
 376
 377 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 378 static void
 379 decide_unrolling_and_peeling (int flags)
 380 {
 381   struct loop *loop;
 382   loop_iterator li;
 383
 384   /* Scan the loops, inner ones first.  */
 385   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 386     {
 387       loop->lpt_decision.decision = LPT_NONE;
 388       location_t locus = get_loop_location (loop);
 389
 390       if (dump_enabled_p ())
 391         dump_printf_loc (TDF_RTL, locus,
 392                          ";; *** Considering loop %d at BB %d for "
 393                          "unrolling and peeling ***\n",
 394                          loop->num, loop->header->index);
 395
 396       /* Do not peel cold areas.  */
 397       if (optimize_loop_for_size_p (loop))
 398         {
 399           if (dump_file)
 400             fprintf (dump_file, ";; Not considering loop, cold area\n");
 401           continue;
 402         }
 403
 404       /* Can the loop be manipulated?  */
 405       if (!can_duplicate_loop_p (loop))
 406         {
 407           if (dump_file)
 408             fprintf (dump_file,
 409                      ";; Not considering loop, cannot duplicate\n");
 410           continue;
 411         }
 412
 413       /* Skip non-innermost loops.  */
 414       if (loop->inner)
 415         {
 416           if (dump_file)
 417             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 418           continue;
 419         }
 420
 421       loop->ninsns = num_loop_insns (loop);
 422       loop->av_ninsns = average_num_loop_insns (loop);
 423
 424       /* Try transformations one by one in decreasing order of
 425          priority.  */
 426
 427       decide_unroll_constant_iterations (loop, flags);
 428       if (loop->lpt_decision.decision == LPT_NONE)
 429         decide_unroll_runtime_iterations (loop, flags);
 430       if (loop->lpt_decision.decision == LPT_NONE)
 431         decide_unroll_stupid (loop, flags);
 432       if (loop->lpt_decision.decision == LPT_NONE)
 433         decide_peel_simple (loop, flags);
 434
 435       report_unroll_peel (loop, locus);
 436     }
 437 }
 438
 439 /* Decide whether the LOOP is once rolling and suitable for complete
 440    peeling.  */
 441 static void
 442 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 443 {
 444   struct niter_desc *desc;
 445
 446   if (dump_file)
 447     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 448
 449   /* Is the loop small enough?  */
 450   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 451     {
 452       if (dump_file)
 453         fprintf (dump_file, ";; Not considering loop, is too big\n");
 454       return;
 455     }
 456
 457   /* Check for simple loops.  */
 458   desc = get_simple_loop_desc (loop);
 459
 460   /* Check number of iterations.  */
 461   if (!desc->simple_p
 462       || desc->assumptions
 463       || desc->infinite
 464       || !desc->const_iter
 465       || (desc->niter != 0
 466           && max_loop_iterations_int (loop) != 0))
 467     {
 468       if (dump_file)
 469         fprintf (dump_file,
 470                  ";; Unable to prove that the loop rolls exactly once\n");
 471       return;
 472     }
 473
 474   /* Success.  */
 475   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 476 }
 477
 478 /* Decide whether the LOOP is suitable for complete peeling.  */
 479 static void
 480 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 481 {
 482   unsigned npeel;
 483   struct niter_desc *desc;
 484
 485   if (dump_file)
 486     fprintf (dump_file, "\n;; Considering peeling completely\n");
 487
 488   /* Skip non-innermost loops.  */
 489   if (loop->inner)
 490     {
 491       if (dump_file)
 492         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 493       return;
 494     }
 495
 496   /* Do not peel cold areas.  */
 497   if (optimize_loop_for_size_p (loop))
 498     {
 499       if (dump_file)
 500         fprintf (dump_file, ";; Not considering loop, cold area\n");
 501       return;
 502     }
 503
 504   /* Can the loop be manipulated?  */
 505   if (!can_duplicate_loop_p (loop))
 506     {
 507       if (dump_file)
 508         fprintf (dump_file,
 509                  ";; Not considering loop, cannot duplicate\n");
 510       return;
 511     }
 512
 513   /* npeel = number of iterations to peel.  */
 514   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 515   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 516     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 517
 518   /* Is the loop small enough?  */
 519   if (!npeel)
 520     {
 521       if (dump_file)
 522         fprintf (dump_file, ";; Not considering loop, is too big\n");
 523       return;
 524     }
 525
 526   /* Check for simple loops.  */
 527   desc = get_simple_loop_desc (loop);
 528
 529   /* Check number of iterations.  */
 530   if (!desc->simple_p
 531       || desc->assumptions
 532       || !desc->const_iter
 533       || desc->infinite)
 534     {
 535       if (dump_file)
 536         fprintf (dump_file,
 537                  ";; Unable to prove that the loop iterates constant times\n");
 538       return;
 539     }
 540
 541   if (desc->niter > npeel - 1)
 542     {
 543       if (dump_file)
 544         {
 545           fprintf (dump_file,
 546                    ";; Not peeling loop completely, rolls too much (");
 547           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 548           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 549         }
 550       return;
 551     }
 552
 553   /* Success.  */
 554   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 555 }
 556
 557 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 558    completely.  The transformation done:
 559
 560    for (i = 0; i < 4; i++)
 561      body;
 562
 563    ==>
 564
 565    i = 0;
 566    body; i++;
 567    body; i++;
 568    body; i++;
 569    body; i++;
 570    */
 571 static void
 572 peel_loop_completely (struct loop *loop)
 573 {
 574   sbitmap wont_exit;
 575   unsigned HOST_WIDE_INT npeel;
 576   unsigned i;
 577   vec<edge> remove_edges;
 578   edge ein;
 579   struct niter_desc *desc = get_simple_loop_desc (loop);
 580   struct opt_info *opt_info = NULL;
 581
 582   npeel = desc->niter;
 583
 584   if (npeel)
 585     {
 586       bool ok;
 587
 588       wont_exit = sbitmap_alloc (npeel + 1);
 589       bitmap_ones (wont_exit);
 590       bitmap_clear_bit (wont_exit, 0);
 591       if (desc->noloop_assumptions)
 592         bitmap_clear_bit (wont_exit, 1);
 593
 594       remove_edges.create (0);
 595
 596       if (flag_split_ivs_in_unroller)
 597         opt_info = analyze_insns_in_loop (loop);
 598
 599       opt_info_start_duplication (opt_info);
 600       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 601                                           npeel,
 602                                           wont_exit, desc->out_edge,
 603                                           &remove_edges,
 604                                           DLTHE_FLAG_UPDATE_FREQ
 605                                           | DLTHE_FLAG_COMPLETTE_PEEL
 606                                           | (opt_info
 607                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 608       gcc_assert (ok);
 609
 610       free (wont_exit);
 611
 612       if (opt_info)
 613         {
 614           apply_opt_in_copies (opt_info, npeel, false, true);
 615           free_opt_info (opt_info);
 616         }
 617
 618       /* Remove the exit edges.  */
 619       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 620         remove_path (ein);
 621       remove_edges.release ();
 622     }
 623
 624   ein = desc->in_edge;
 625   free_simple_loop_desc (loop);
 626
 627   /* Now remove the unreachable part of the last iteration and cancel
 628      the loop.  */
 629   remove_path (ein);
 630
 631   if (dump_file)
 632     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 633 }
 634
 635 /* Decide whether to unroll LOOP iterating constant number of times
 636    and how much.  */
 637
 638 static void
 639 decide_unroll_constant_iterations (struct loop *loop, int flags)
 640 {
 641   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 642   struct niter_desc *desc;
 643   double_int iterations;
 644
 645   if (!(flags & UAP_UNROLL))
 646     {
 647       /* We were not asked to, just return back silently.  */
 648       return;
 649     }
 650
 651   if (dump_file)
 652     fprintf (dump_file,
 653              "\n;; Considering unrolling loop with constant "
 654              "number of iterations\n");
 655
 656   /* nunroll = total number of copies of the original loop body in
 657      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 658   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 659   nunroll_by_av
 660     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 661   if (nunroll > nunroll_by_av)
 662     nunroll = nunroll_by_av;
 663   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 664     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 665
 666   /* Skip big loops.  */
 667   if (nunroll <= 1)
 668     {
 669       if (dump_file)
 670         fprintf (dump_file, ";; Not considering loop, is too big\n");
 671       return;
 672     }
 673
 674   /* Check for simple loops.  */
 675   desc = get_simple_loop_desc (loop);
 676
 677   /* Check number of iterations.  */
 678   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 679     {
 680       if (dump_file)
 681         fprintf (dump_file,
 682                  ";; Unable to prove that the loop iterates constant times\n");
 683       return;
 684     }
 685
 686   /* Check whether the loop rolls enough to consider.
 687      Consult also loop bounds and profile; in the case the loop has more
 688      than one exit it may well loop less than determined maximal number
 689      of iterations.  */
 690   if (desc->niter < 2 * nunroll
 691       || ((estimated_loop_iterations (loop, &iterations)
 692            || max_loop_iterations (loop, &iterations))
 693           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 694     {
 695       if (dump_file)
 696         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 697       return;
 698     }
 699
 700   /* Success; now compute number of iterations to unroll.  We alter
 701      nunroll so that as few as possible copies of loop body are
 702      necessary, while still not decreasing the number of unrollings
 703      too much (at most by 1).  */
 704   best_copies = 2 * nunroll + 10;
 705
 706   i = 2 * nunroll + 2;
 707   if (i - 1 >= desc->niter)
 708     i = desc->niter - 2;
 709
 710   for (; i >= nunroll - 1; i--)
 711     {
 712       unsigned exit_mod = desc->niter % (i + 1);
 713
 714       if (!loop_exit_at_end_p (loop))
 715         n_copies = exit_mod + i + 1;
 716       else if (exit_mod != (unsigned) i
 717                || desc->noloop_assumptions != NULL_RTX)
 718         n_copies = exit_mod + i + 2;
 719       else
 720         n_copies = i + 1;
 721
 722       if (n_copies < best_copies)
 723         {
 724           best_copies = n_copies;
 725           best_unroll = i;
 726         }
 727     }
 728
 729   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 730   loop->lpt_decision.times = best_unroll;
 731 }
 732
 733 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 734    The transformation does this:
 735
 736    for (i = 0; i < 102; i++)
 737      body;
 738
 739    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 740
 741    i = 0;
 742    body; i++;
 743    body; i++;
 744    while (i < 102)
 745      {
 746        body; i++;
 747        body; i++;
 748        body; i++;
 749        body; i++;
 750      }
 751   */
 752 static void
 753 unroll_loop_constant_iterations (struct loop *loop)
 754 {
 755   unsigned HOST_WIDE_INT niter;
 756   unsigned exit_mod;
 757   sbitmap wont_exit;
 758   unsigned i;
 759   vec<edge> remove_edges;
 760   edge e;
 761   unsigned max_unroll = loop->lpt_decision.times;
 762   struct niter_desc *desc = get_simple_loop_desc (loop);
 763   bool exit_at_end = loop_exit_at_end_p (loop);
 764   struct opt_info *opt_info = NULL;
 765   bool ok;
 766
 767   niter = desc->niter;
 768
 769   /* Should not get here (such loop should be peeled instead).  */
 770   gcc_assert (niter > max_unroll + 1);
 771
 772   exit_mod = niter % (max_unroll + 1);
 773
 774   wont_exit = sbitmap_alloc (max_unroll + 1);
 775   bitmap_ones (wont_exit);
 776
 777   remove_edges.create (0);
 778   if (flag_split_ivs_in_unroller
 779       || flag_variable_expansion_in_unroller)
 780     opt_info = analyze_insns_in_loop (loop);
 781
 782   if (!exit_at_end)
 783     {
 784       /* The exit is not at the end of the loop; leave exit test
 785          in the first copy, so that the loops that start with test
 786          of exit condition have continuous body after unrolling.  */
 787
 788       if (dump_file)
 789         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 790
 791       /* Peel exit_mod iterations.  */
 792       bitmap_clear_bit (wont_exit, 0);
 793       if (desc->noloop_assumptions)
 794         bitmap_clear_bit (wont_exit, 1);
 795
 796       if (exit_mod)
 797         {
 798           opt_info_start_duplication (opt_info);
 799           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 800                                               exit_mod,
 801                                               wont_exit, desc->out_edge,
 802                                               &remove_edges,
 803                                               DLTHE_FLAG_UPDATE_FREQ
 804                                               | (opt_info && exit_mod > 1
 805                                                  ? DLTHE_RECORD_COPY_NUMBER
 806                                                    : 0));
 807           gcc_assert (ok);
 808
 809           if (opt_info && exit_mod > 1)
 810             apply_opt_in_copies (opt_info, exit_mod, false, false);
 811
 812           desc->noloop_assumptions = NULL_RTX;
 813           desc->niter -= exit_mod;
 814           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 815           if (loop->any_estimate
 816               && double_int::from_uhwi (exit_mod).ule
 817                    (loop->nb_iterations_estimate))
 818             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 819           else
 820             loop->any_estimate = false;
 821         }
 822
 823       bitmap_set_bit (wont_exit, 1);
 824     }
 825   else
 826     {
 827       /* Leave exit test in last copy, for the same reason as above if
 828          the loop tests the condition at the end of loop body.  */
 829
 830       if (dump_file)
 831         fprintf (dump_file, ";; Condition at end of loop.\n");
 832
 833       /* We know that niter >= max_unroll + 2; so we do not need to care of
 834          case when we would exit before reaching the loop.  So just peel
 835          exit_mod + 1 iterations.  */
 836       if (exit_mod != max_unroll
 837           || desc->noloop_assumptions)
 838         {
 839           bitmap_clear_bit (wont_exit, 0);
 840           if (desc->noloop_assumptions)
 841             bitmap_clear_bit (wont_exit, 1);
 842
 843           opt_info_start_duplication (opt_info);
 844           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 845                                               exit_mod + 1,
 846                                               wont_exit, desc->out_edge,
 847                                               &remove_edges,
 848                                               DLTHE_FLAG_UPDATE_FREQ
 849                                               | (opt_info && exit_mod > 0
 850                                                  ? DLTHE_RECORD_COPY_NUMBER
 851                                                    : 0));
 852           gcc_assert (ok);
 853
 854           if (opt_info && exit_mod > 0)
 855             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 856
 857           desc->niter -= exit_mod + 1;
 858           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 859           if (loop->any_estimate
 860               && double_int::from_uhwi (exit_mod + 1).ule
 861                    (loop->nb_iterations_estimate))
 862             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 863           else
 864             loop->any_estimate = false;
 865           desc->noloop_assumptions = NULL_RTX;
 866
 867           bitmap_set_bit (wont_exit, 0);
 868           bitmap_set_bit (wont_exit, 1);
 869         }
 870
 871       bitmap_clear_bit (wont_exit, max_unroll);
 872     }
 873
 874   /* Now unroll the loop.  */
 875
 876   opt_info_start_duplication (opt_info);
 877   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 878                                       max_unroll,
 879                                       wont_exit, desc->out_edge,
 880                                       &remove_edges,
 881                                       DLTHE_FLAG_UPDATE_FREQ
 882                                       | (opt_info
 883                                          ? DLTHE_RECORD_COPY_NUMBER
 884                                            : 0));
 885   gcc_assert (ok);
 886
 887   if (opt_info)
 888     {
 889       apply_opt_in_copies (opt_info, max_unroll, true, true);
 890       free_opt_info (opt_info);
 891     }
 892
 893   free (wont_exit);
 894
 895   if (exit_at_end)
 896     {
 897       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 898       /* Find a new in and out edge; they are in the last copy we have made.  */
 899
 900       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 901         {
 902           desc->out_edge = EDGE_SUCC (exit_block, 0);
 903           desc->in_edge = EDGE_SUCC (exit_block, 1);
 904         }
 905       else
 906         {
 907           desc->out_edge = EDGE_SUCC (exit_block, 1);
 908           desc->in_edge = EDGE_SUCC (exit_block, 0);
 909         }
 910     }
 911
 912   desc->niter /= max_unroll + 1;
 913   loop->nb_iterations_upper_bound
 914     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 915                                                                    + 1),
 916                                             TRUNC_DIV_EXPR);
 917   if (loop->any_estimate)
 918     loop->nb_iterations_estimate
 919       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 920                                                                   + 1),
 921                                            TRUNC_DIV_EXPR);
 922   desc->niter_expr = GEN_INT (desc->niter);
 923
 924   /* Remove the edges.  */
 925   FOR_EACH_VEC_ELT (remove_edges, i, e)
 926     remove_path (e);
 927   remove_edges.release ();
 928
 929   if (dump_file)
 930     fprintf (dump_file,
 931              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 932              max_unroll, num_loop_insns (loop));
 933 }
 934
 935 /* Decide whether to unroll LOOP iterating runtime computable number of times
 936    and how much.  */
 937 static void
 938 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 939 {
 940   unsigned nunroll, nunroll_by_av, i;
 941   struct niter_desc *desc;
 942   double_int iterations;
 943
 944   if (!(flags & UAP_UNROLL))
 945     {
 946       /* We were not asked to, just return back silently.  */
 947       return;
 948     }
 949
 950   if (dump_file)
 951     fprintf (dump_file,
 952              "\n;; Considering unrolling loop with runtime "
 953              "computable number of iterations\n");
 954
 955   /* nunroll = total number of copies of the original loop body in
 956      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 957   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 958   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 959   if (nunroll > nunroll_by_av)
 960     nunroll = nunroll_by_av;
 961   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 962     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 963
 964   if (targetm.loop_unroll_adjust)
 965     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 966
 967   /* Skip big loops.  */
 968   if (nunroll <= 1)
 969     {
 970       if (dump_file)
 971         fprintf (dump_file, ";; Not considering loop, is too big\n");
 972       return;
 973     }
 974
 975   /* Check for simple loops.  */
 976   desc = get_simple_loop_desc (loop);
 977
 978   /* Check simpleness.  */
 979   if (!desc->simple_p || desc->assumptions)
 980     {
 981       if (dump_file)
 982         fprintf (dump_file,
 983                  ";; Unable to prove that the number of iterations "
 984                  "can be counted in runtime\n");
 985       return;
 986     }
 987
 988   if (desc->const_iter)
 989     {
 990       if (dump_file)
 991         fprintf (dump_file, ";; Loop iterates constant times\n");
 992       return;
 993     }
 994
 995   /* Check whether the loop rolls.  */
 996   if ((estimated_loop_iterations (loop, &iterations)
 997        || max_loop_iterations (loop, &iterations))
 998       && iterations.ult (double_int::from_shwi (2 * nunroll)))
 999     {
1000       if (dump_file)
1001         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1002       return;
1003     }
1004
1005   /* Success; now force nunroll to be power of 2, as we are unable to
1006      cope with overflows in computation of number of iterations.  */
1007   for (i = 1; 2 * i <= nunroll; i *= 2)
1008     continue;
1009
1010   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1011   loop->lpt_decision.times = i - 1;
1012 }
1013
1014 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1015    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1016    and NULL is returned instead.  */
1017
1018 basic_block
1019 split_edge_and_insert (edge e, rtx insns)
1020 {
1021   basic_block bb;
1022
1023   if (!insns)
1024     return NULL;
1025   bb = split_edge (e);
1026   emit_insn_after (insns, BB_END (bb));
1027
1028   /* ??? We used to assume that INSNS can contain control flow insns, and
1029      that we had to try to find sub basic blocks in BB to maintain a valid
1030      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1031      and call break_superblocks when going out of cfglayout mode.  But it
1032      turns out that this never happens; and that if it does ever happen,
1033      the TODO_verify_flow at the end of the RTL loop passes would fail.
1034
1035      There are two reasons why we expected we could have control flow insns
1036      in INSNS.  The first is when a comparison has to be done in parts, and
1037      the second is when the number of iterations is computed for loops with
1038      the number of iterations known at runtime.  In both cases, test cases
1039      to get control flow in INSNS appear to be impossible to construct:
1040
1041       * If do_compare_rtx_and_jump needs several branches to do comparison
1042         in a mode that needs comparison by parts, we cannot analyze the
1043         number of iterations of the loop, and we never get to unrolling it.
1044
1045       * The code in expand_divmod that was suspected to cause creation of
1046         branching code seems to be only accessed for signed division.  The
1047         divisions used by # of iterations analysis are always unsigned.
1048         Problems might arise on architectures that emits branching code
1049         for some operations that may appear in the unroller (especially
1050         for division), but we have no such architectures.
1051
1052      Considering all this, it was decided that we should for now assume
1053      that INSNS can in theory contain control flow insns, but in practice
1054      it never does.  So we don't handle the theoretical case, and should
1055      a real failure ever show up, we have a pretty good clue for how to
1056      fix it.  */
1057
1058   return bb;
1059 }
1060
1061 /* Unroll LOOP for which we are able to count number of iterations in runtime
1062    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1063    extra care for case n < 0):
1064
1065    for (i = 0; i < n; i++)
1066      body;
1067
1068    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1069
1070    i = 0;
1071    mod = n % 4;
1072
1073    switch (mod)
1074      {
1075        case 3:
1076          body; i++;
1077        case 2:
1078          body; i++;
1079        case 1:
1080          body; i++;
1081        case 0: ;
1082      }
1083
1084    while (i < n)
1085      {
1086        body; i++;
1087        body; i++;
1088        body; i++;
1089        body; i++;
1090      }
1091    */
1092 static void
1093 unroll_loop_runtime_iterations (struct loop *loop)
1094 {
1095   rtx old_niter, niter, init_code, branch_code, tmp;
1096   unsigned i, j, p;
1097   basic_block preheader, *body, swtch, ezc_swtch;
1098   vec<basic_block> dom_bbs;
1099   sbitmap wont_exit;
1100   int may_exit_copy;
1101   unsigned n_peel;
1102   vec<edge> remove_edges;
1103   edge e;
1104   bool extra_zero_check, last_may_exit;
1105   unsigned max_unroll = loop->lpt_decision.times;
1106   struct niter_desc *desc = get_simple_loop_desc (loop);
1107   bool exit_at_end = loop_exit_at_end_p (loop);
1108   struct opt_info *opt_info = NULL;
1109   bool ok;
1110
1111   if (flag_split_ivs_in_unroller
1112       || flag_variable_expansion_in_unroller)
1113     opt_info = analyze_insns_in_loop (loop);
1114
1115   /* Remember blocks whose dominators will have to be updated.  */
1116   dom_bbs.create (0);
1117
1118   body = get_loop_body (loop);
1119   for (i = 0; i < loop->num_nodes; i++)
1120     {
1121       vec<basic_block> ldom;
1122       basic_block bb;
1123
1124       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1125       FOR_EACH_VEC_ELT (ldom, j, bb)
1126         if (!flow_bb_inside_loop_p (loop, bb))
1127           dom_bbs.safe_push (bb);
1128
1129       ldom.release ();
1130     }
1131   free (body);
1132
1133   if (!exit_at_end)
1134     {
1135       /* Leave exit in first copy (for explanation why see comment in
1136          unroll_loop_constant_iterations).  */
1137       may_exit_copy = 0;
1138       n_peel = max_unroll - 1;
1139       extra_zero_check = true;
1140       last_may_exit = false;
1141     }
1142   else
1143     {
1144       /* Leave exit in last copy (for explanation why see comment in
1145          unroll_loop_constant_iterations).  */
1146       may_exit_copy = max_unroll;
1147       n_peel = max_unroll;
1148       extra_zero_check = false;
1149       last_may_exit = true;
1150     }
1151
1152   /* Get expression for number of iterations.  */
1153   start_sequence ();
1154   old_niter = niter = gen_reg_rtx (desc->mode);
1155   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1156   if (tmp != niter)
1157     emit_move_insn (niter, tmp);
1158
1159   /* Count modulo by ANDing it with max_unroll; we use the fact that
1160      the number of unrollings is a power of two, and thus this is correct
1161      even if there is overflow in the computation.  */
1162   niter = expand_simple_binop (desc->mode, AND,
1163                                niter,
1164                                GEN_INT (max_unroll),
1165                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1166
1167   init_code = get_insns ();
1168   end_sequence ();
1169   unshare_all_rtl_in_chain (init_code);
1170
1171   /* Precondition the loop.  */
1172   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1173
1174   remove_edges.create (0);
1175
1176   wont_exit = sbitmap_alloc (max_unroll + 2);
1177
1178   /* Peel the first copy of loop body (almost always we must leave exit test
1179      here; the only exception is when we have extra zero check and the number
1180      of iterations is reliable.  Also record the place of (possible) extra
1181      zero check.  */
1182   bitmap_clear (wont_exit);
1183   if (extra_zero_check
1184       && !desc->noloop_assumptions)
1185     bitmap_set_bit (wont_exit, 1);
1186   ezc_swtch = loop_preheader_edge (loop)->src;
1187   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1188                                       1, wont_exit, desc->out_edge,
1189                                       &remove_edges,
1190                                       DLTHE_FLAG_UPDATE_FREQ);
1191   gcc_assert (ok);
1192
1193   /* Record the place where switch will be built for preconditioning.  */
1194   swtch = split_edge (loop_preheader_edge (loop));
1195
1196   for (i = 0; i < n_peel; i++)
1197     {
1198       /* Peel the copy.  */
1199       bitmap_clear (wont_exit);
1200       if (i != n_peel - 1 || !last_may_exit)
1201         bitmap_set_bit (wont_exit, 1);
1202       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1203                                           1, wont_exit, desc->out_edge,
1204                                           &remove_edges,
1205                                           DLTHE_FLAG_UPDATE_FREQ);
1206       gcc_assert (ok);
1207
1208       /* Create item for switch.  */
1209       j = n_peel - i - (extra_zero_check ? 0 : 1);
1210       p = REG_BR_PROB_BASE / (i + 2);
1211
1212       preheader = split_edge (loop_preheader_edge (loop));
1213       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1214                                           block_label (preheader), p,
1215                                           NULL_RTX);
1216
1217       /* We rely on the fact that the compare and jump cannot be optimized out,
1218          and hence the cfg we create is correct.  */
1219       gcc_assert (branch_code != NULL_RTX);
1220
1221       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1222       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1223       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1224       e = make_edge (swtch, preheader,
1225                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1226       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1227       e->probability = p;
1228     }
1229
1230   if (extra_zero_check)
1231     {
1232       /* Add branch for zero iterations.  */
1233       p = REG_BR_PROB_BASE / (max_unroll + 1);
1234       swtch = ezc_swtch;
1235       preheader = split_edge (loop_preheader_edge (loop));
1236       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1237                                           block_label (preheader), p,
1238                                           NULL_RTX);
1239       gcc_assert (branch_code != NULL_RTX);
1240
1241       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1242       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1243       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1244       e = make_edge (swtch, preheader,
1245                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1246       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1247       e->probability = p;
1248     }
1249
1250   /* Recount dominators for outer blocks.  */
1251   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1252
1253   /* And unroll loop.  */
1254
1255   bitmap_ones (wont_exit);
1256   bitmap_clear_bit (wont_exit, may_exit_copy);
1257   opt_info_start_duplication (opt_info);
1258
1259   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1260                                       max_unroll,
1261                                       wont_exit, desc->out_edge,
1262                                       &remove_edges,
1263                                       DLTHE_FLAG_UPDATE_FREQ
1264                                       | (opt_info
1265                                          ? DLTHE_RECORD_COPY_NUMBER
1266                                            : 0));
1267   gcc_assert (ok);
1268
1269   if (opt_info)
1270     {
1271       apply_opt_in_copies (opt_info, max_unroll, true, true);
1272       free_opt_info (opt_info);
1273     }
1274
1275   free (wont_exit);
1276
1277   if (exit_at_end)
1278     {
1279       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1280       /* Find a new in and out edge; they are in the last copy we have
1281          made.  */
1282
1283       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1284         {
1285           desc->out_edge = EDGE_SUCC (exit_block, 0);
1286           desc->in_edge = EDGE_SUCC (exit_block, 1);
1287         }
1288       else
1289         {
1290           desc->out_edge = EDGE_SUCC (exit_block, 1);
1291           desc->in_edge = EDGE_SUCC (exit_block, 0);
1292         }
1293     }
1294
1295   /* Remove the edges.  */
1296   FOR_EACH_VEC_ELT (remove_edges, i, e)
1297     remove_path (e);
1298   remove_edges.release ();
1299
1300   /* We must be careful when updating the number of iterations due to
1301      preconditioning and the fact that the value must be valid at entry
1302      of the loop.  After passing through the above code, we see that
1303      the correct new number of iterations is this:  */
1304   gcc_assert (!desc->const_iter);
1305   desc->niter_expr =
1306     simplify_gen_binary (UDIV, desc->mode, old_niter,
1307                          GEN_INT (max_unroll + 1));
1308   loop->nb_iterations_upper_bound
1309     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1310                                                                    + 1),
1311                                             TRUNC_DIV_EXPR);
1312   if (loop->any_estimate)
1313     loop->nb_iterations_estimate
1314       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1315                                                                   + 1),
1316                                            TRUNC_DIV_EXPR);
1317   if (exit_at_end)
1318     {
1319       desc->niter_expr =
1320         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1321       desc->noloop_assumptions = NULL_RTX;
1322       --loop->nb_iterations_upper_bound;
1323       if (loop->any_estimate
1324           && loop->nb_iterations_estimate != double_int_zero)
1325         --loop->nb_iterations_estimate;
1326       else
1327         loop->any_estimate = false;
1328     }
1329
1330   if (dump_file)
1331     fprintf (dump_file,
1332              ";; Unrolled loop %d times, counting # of iterations "
1333              "in runtime, %i insns\n",
1334              max_unroll, num_loop_insns (loop));
1335
1336   dom_bbs.release ();
1337 }
1338
1339 /* Decide whether to simply peel LOOP and how much.  */
1340 static void
1341 decide_peel_simple (struct loop *loop, int flags)
1342 {
1343   unsigned npeel;
1344   double_int iterations;
1345
1346   if (!(flags & UAP_PEEL))
1347     {
1348       /* We were not asked to, just return back silently.  */
1349       return;
1350     }
1351
1352   if (dump_file)
1353     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1354
1355   /* npeel = number of iterations to peel.  */
1356   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1357   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1358     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1359
1360   /* Skip big loops.  */
1361   if (!npeel)
1362     {
1363       if (dump_file)
1364         fprintf (dump_file, ";; Not considering loop, is too big\n");
1365       return;
1366     }
1367
1368   /* Do not simply peel loops with branches inside -- it increases number
1369      of mispredicts.
1370      Exception is when we do have profile and we however have good chance
1371      to peel proper number of iterations loop will iterate in practice.
1372      TODO: this heuristic needs tunning; while for complette unrolling
1373      the branch inside loop mostly eliminates any improvements, for
1374      peeling it is not the case.  Also a function call inside loop is
1375      also branch from branch prediction POV (and probably better reason
1376      to not unroll/peel).  */
1377   if (num_loop_branches (loop) > 1
1378       && profile_status != PROFILE_READ)
1379     {
1380       if (dump_file)
1381         fprintf (dump_file, ";; Not peeling, contains branches\n");
1382       return;
1383     }
1384
1385   /* If we have realistic estimate on number of iterations, use it.  */
1386   if (estimated_loop_iterations (loop, &iterations))
1387     {
1388       if (double_int::from_shwi (npeel).ule (iterations))
1389         {
1390           if (dump_file)
1391             {
1392               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1393               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1394                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1395               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1396                        npeel);
1397             }
1398           return;
1399         }
1400       npeel = iterations.to_shwi () + 1;
1401     }
1402   /* If we have small enough bound on iterations, we can still peel (completely
1403      unroll).  */
1404   else if (max_loop_iterations (loop, &iterations)
1405            && iterations.ult (double_int::from_shwi (npeel)))
1406     npeel = iterations.to_shwi () + 1;
1407   else
1408     {
1409       /* For now we have no good heuristics to decide whether loop peeling
1410          will be effective, so disable it.  */
1411       if (dump_file)
1412         fprintf (dump_file,
1413                  ";; Not peeling loop, no evidence it will be profitable\n");
1414       return;
1415     }
1416
1417   /* Success.  */
1418   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1419   loop->lpt_decision.times = npeel;
1420 }
1421
1422 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1423
1424    while (cond)
1425      body;
1426
1427    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1428
1429    if (!cond) goto end;
1430    body;
1431    if (!cond) goto end;
1432    body;
1433    if (!cond) goto end;
1434    body;
1435    while (cond)
1436      body;
1437    end: ;
1438    */
1439 static void
1440 peel_loop_simple (struct loop *loop)
1441 {
1442   sbitmap wont_exit;
1443   unsigned npeel = loop->lpt_decision.times;
1444   struct niter_desc *desc = get_simple_loop_desc (loop);
1445   struct opt_info *opt_info = NULL;
1446   bool ok;
1447
1448   if (flag_split_ivs_in_unroller && npeel > 1)
1449     opt_info = analyze_insns_in_loop (loop);
1450
1451   wont_exit = sbitmap_alloc (npeel + 1);
1452   bitmap_clear (wont_exit);
1453
1454   opt_info_start_duplication (opt_info);
1455
1456   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1457                                       npeel, wont_exit, NULL,
1458                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1459                                       | (opt_info
1460                                          ? DLTHE_RECORD_COPY_NUMBER
1461                                            : 0));
1462   gcc_assert (ok);
1463
1464   free (wont_exit);
1465
1466   if (opt_info)
1467     {
1468       apply_opt_in_copies (opt_info, npeel, false, false);
1469       free_opt_info (opt_info);
1470     }
1471
1472   if (desc->simple_p)
1473     {
1474       if (desc->const_iter)
1475         {
1476           desc->niter -= npeel;
1477           desc->niter_expr = GEN_INT (desc->niter);
1478           desc->noloop_assumptions = NULL_RTX;
1479         }
1480       else
1481         {
1482           /* We cannot just update niter_expr, as its value might be clobbered
1483              inside loop.  We could handle this by counting the number into
1484              temporary just like we do in runtime unrolling, but it does not
1485              seem worthwhile.  */
1486           free_simple_loop_desc (loop);
1487         }
1488     }
1489   if (dump_file)
1490     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1491 }
1492
1493 /* Decide whether to unroll LOOP stupidly and how much.  */
1494 static void
1495 decide_unroll_stupid (struct loop *loop, int flags)
1496 {
1497   unsigned nunroll, nunroll_by_av, i;
1498   struct niter_desc *desc;
1499   double_int iterations;
1500
1501   if (!(flags & UAP_UNROLL_ALL))
1502     {
1503       /* We were not asked to, just return back silently.  */
1504       return;
1505     }
1506
1507   if (dump_file)
1508     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1509
1510   /* nunroll = total number of copies of the original loop body in
1511      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1512   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1513   nunroll_by_av
1514     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1515   if (nunroll > nunroll_by_av)
1516     nunroll = nunroll_by_av;
1517   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1518     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1519
1520   if (targetm.loop_unroll_adjust)
1521     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1522
1523   /* Skip big loops.  */
1524   if (nunroll <= 1)
1525     {
1526       if (dump_file)
1527         fprintf (dump_file, ";; Not considering loop, is too big\n");
1528       return;
1529     }
1530
1531   /* Check for simple loops.  */
1532   desc = get_simple_loop_desc (loop);
1533
1534   /* Check simpleness.  */
1535   if (desc->simple_p && !desc->assumptions)
1536     {
1537       if (dump_file)
1538         fprintf (dump_file, ";; The loop is simple\n");
1539       return;
1540     }
1541
1542   /* Do not unroll loops with branches inside -- it increases number
1543      of mispredicts.
1544      TODO: this heuristic needs tunning; call inside the loop body
1545      is also relatively good reason to not unroll.  */
1546   if (num_loop_branches (loop) > 1)
1547     {
1548       if (dump_file)
1549         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1550       return;
1551     }
1552
1553   /* Check whether the loop rolls.  */
1554   if ((estimated_loop_iterations (loop, &iterations)
1555        || max_loop_iterations (loop, &iterations))
1556       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1557     {
1558       if (dump_file)
1559         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1560       return;
1561     }
1562
1563   /* Success.  Now force nunroll to be power of 2, as it seems that this
1564      improves results (partially because of better alignments, partially
1565      because of some dark magic).  */
1566   for (i = 1; 2 * i <= nunroll; i *= 2)
1567     continue;
1568
1569   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1570   loop->lpt_decision.times = i - 1;
1571 }
1572
1573 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1574
1575    while (cond)
1576      body;
1577
1578    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1579
1580    while (cond)
1581      {
1582        body;
1583        if (!cond) break;
1584        body;
1585        if (!cond) break;
1586        body;
1587        if (!cond) break;
1588        body;
1589      }
1590    */
1591 static void
1592 unroll_loop_stupid (struct loop *loop)
1593 {
1594   sbitmap wont_exit;
1595   unsigned nunroll = loop->lpt_decision.times;
1596   struct niter_desc *desc = get_simple_loop_desc (loop);
1597   struct opt_info *opt_info = NULL;
1598   bool ok;
1599
1600   if (flag_split_ivs_in_unroller
1601       || flag_variable_expansion_in_unroller)
1602     opt_info = analyze_insns_in_loop (loop);
1603
1604
1605   wont_exit = sbitmap_alloc (nunroll + 1);
1606   bitmap_clear (wont_exit);
1607   opt_info_start_duplication (opt_info);
1608
1609   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1610                                       nunroll, wont_exit,
1611                                       NULL, NULL,
1612                                       DLTHE_FLAG_UPDATE_FREQ
1613                                       | (opt_info
1614                                          ? DLTHE_RECORD_COPY_NUMBER
1615                                            : 0));
1616   gcc_assert (ok);
1617
1618   if (opt_info)
1619     {
1620       apply_opt_in_copies (opt_info, nunroll, true, true);
1621       free_opt_info (opt_info);
1622     }
1623
1624   free (wont_exit);
1625
1626   if (desc->simple_p)
1627     {
1628       /* We indeed may get here provided that there are nontrivial assumptions
1629          for a loop to be really simple.  We could update the counts, but the
1630          problem is that we are unable to decide which exit will be taken
1631          (not really true in case the number of iterations is constant,
1632          but noone will do anything with this information, so we do not
1633          worry about it).  */
1634       desc->simple_p = false;
1635     }
1636
1637   if (dump_file)
1638     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1639              nunroll, num_loop_insns (loop));
1640 }
1641
1642 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1643    Set *DEBUG_USES to the number of debug insns that reference the
1644    variable.  */
1645
1646 bool
1647 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1648                                   int *debug_uses)
1649 {
1650   basic_block *body, bb;
1651   unsigned i;
1652   int count_ref = 0;
1653   rtx insn;
1654
1655   body = get_loop_body (loop);
1656   for (i = 0; i < loop->num_nodes; i++)
1657     {
1658       bb = body[i];
1659
1660       FOR_BB_INSNS (bb, insn)
1661         if (!rtx_referenced_p (reg, insn))
1662           continue;
1663         else if (DEBUG_INSN_P (insn))
1664           ++*debug_uses;
1665         else if (++count_ref > 1)
1666           break;
1667     }
1668   free (body);
1669   return (count_ref  == 1);
1670 }
1671
1672 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1673
1674 static void
1675 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1676 {
1677   basic_block *body, bb;
1678   unsigned i;
1679   rtx insn;
1680
1681   body = get_loop_body (loop);
1682   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1683     {
1684       bb = body[i];
1685
1686       FOR_BB_INSNS (bb, insn)
1687         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1688           continue;
1689         else
1690           {
1691             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1692                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1693             if (!--debug_uses)
1694               break;
1695           }
1696     }
1697   free (body);
1698 }
1699
1700 /* Determine whether INSN contains an accumulator
1701    which can be expanded into separate copies,
1702    one for each copy of the LOOP body.
1703
1704    for (i = 0 ; i < n; i++)
1705      sum += a[i];
1706
1707    ==>
1708
1709    sum += a[i]
1710    ....
1711    i = i+1;
1712    sum1 += a[i]
1713    ....
1714    i = i+1
1715    sum2 += a[i];
1716    ....
1717
1718    Return NULL if INSN contains no opportunity for expansion of accumulator.
1719    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1720    information and return a pointer to it.
1721 */
1722
1723 static struct var_to_expand *
1724 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1725 {
1726   rtx set, dest, src;
1727   struct var_to_expand *ves;
1728   unsigned accum_pos;
1729   enum rtx_code code;
1730   int debug_uses = 0;
1731
1732   set = single_set (insn);
1733   if (!set)
1734     return NULL;
1735
1736   dest = SET_DEST (set);
1737   src = SET_SRC (set);
1738   code = GET_CODE (src);
1739
1740   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1741     return NULL;
1742
1743   if (FLOAT_MODE_P (GET_MODE (dest)))
1744     {
1745       if (!flag_associative_math)
1746         return NULL;
1747       /* In the case of FMA, we're also changing the rounding.  */
1748       if (code == FMA && !flag_unsafe_math_optimizations)
1749         return NULL;
1750     }
1751
1752   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1753      in MD.  But if there is no optab to generate the insn, we can not
1754      perform the variable expansion.  This can happen if an MD provides
1755      an insn but not a named pattern to generate it, for example to avoid
1756      producing code that needs additional mode switches like for x87/mmx.
1757
1758      So we check have_insn_for which looks for an optab for the operation
1759      in SRC.  If it doesn't exist, we can't perform the expansion even
1760      though INSN is valid.  */
1761   if (!have_insn_for (code, GET_MODE (src)))
1762     return NULL;
1763
1764   if (!REG_P (dest)
1765       && !(GET_CODE (dest) == SUBREG
1766            && REG_P (SUBREG_REG (dest))))
1767     return NULL;
1768
1769   /* Find the accumulator use within the operation.  */
1770   if (code == FMA)
1771     {
1772       /* We only support accumulation via FMA in the ADD position.  */
1773       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1774         return NULL;
1775       accum_pos = 2;
1776     }
1777   else if (rtx_equal_p (dest, XEXP (src, 0)))
1778     accum_pos = 0;
1779   else if (rtx_equal_p (dest, XEXP (src, 1)))
1780     {
1781       /* The method of expansion that we are using; which includes the
1782          initialization of the expansions with zero and the summation of
1783          the expansions at the end of the computation will yield wrong
1784          results for (x = something - x) thus avoid using it in that case.  */
1785       if (code == MINUS)
1786         return NULL;
1787       accum_pos = 1;
1788     }
1789   else
1790     return NULL;
1791
1792   /* It must not otherwise be used.  */
1793   if (code == FMA)
1794     {
1795       if (rtx_referenced_p (dest, XEXP (src, 0))
1796           || rtx_referenced_p (dest, XEXP (src, 1)))
1797         return NULL;
1798     }
1799   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1800     return NULL;
1801
1802   /* It must be used in exactly one insn.  */
1803   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1804     return NULL;
1805
1806   if (dump_file)
1807     {
1808       fprintf (dump_file, "\n;; Expanding Accumulator ");
1809       print_rtl (dump_file, dest);
1810       fprintf (dump_file, "\n");
1811     }
1812
1813   if (debug_uses)
1814     /* Instead of resetting the debug insns, we could replace each
1815        debug use in the loop with the sum or product of all expanded
1816        accummulators.  Since we'll only know of all expansions at the
1817        end, we'd have to keep track of which vars_to_expand a debug
1818        insn in the loop references, take note of each copy of the
1819        debug insn during unrolling, and when it's all done, compute
1820        the sum or product of each variable and adjust the original
1821        debug insn and each copy thereof.  What a pain!  */
1822     reset_debug_uses_in_loop (loop, dest, debug_uses);
1823
1824   /* Record the accumulator to expand.  */
1825   ves = XNEW (struct var_to_expand);
1826   ves->insn = insn;
1827   ves->reg = copy_rtx (dest);
1828   ves->var_expansions.create (1);
1829   ves->next = NULL;
1830   ves->op = GET_CODE (src);
1831   ves->expansion_count = 0;
1832   ves->reuse_expansion = 0;
1833   return ves;
1834 }
1835
1836 /* Determine whether there is an induction variable in INSN that
1837    we would like to split during unrolling.
1838
1839    I.e. replace
1840
1841    i = i + 1;
1842    ...
1843    i = i + 1;
1844    ...
1845    i = i + 1;
1846    ...
1847
1848    type chains by
1849
1850    i0 = i + 1
1851    ...
1852    i = i0 + 1
1853    ...
1854    i = i0 + 2
1855    ...
1856
1857    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1858    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1859    pointer to it.  */
1860
1861 static struct iv_to_split *
1862 analyze_iv_to_split_insn (rtx insn)
1863 {
1864   rtx set, dest;
1865   struct rtx_iv iv;
1866   struct iv_to_split *ivts;
1867   bool ok;
1868
1869   /* For now we just split the basic induction variables.  Later this may be
1870      extended for example by selecting also addresses of memory references.  */
1871   set = single_set (insn);
1872   if (!set)
1873     return NULL;
1874
1875   dest = SET_DEST (set);
1876   if (!REG_P (dest))
1877     return NULL;
1878
1879   if (!biv_p (insn, dest))
1880     return NULL;
1881
1882   ok = iv_analyze_result (insn, dest, &iv);
1883
1884   /* This used to be an assert under the assumption that if biv_p returns
1885      true that iv_analyze_result must also return true.  However, that
1886      assumption is not strictly correct as evidenced by pr25569.
1887
1888      Returning NULL when iv_analyze_result returns false is safe and
1889      avoids the problems in pr25569 until the iv_analyze_* routines
1890      can be fixed, which is apparently hard and time consuming
1891      according to their author.  */
1892   if (! ok)
1893     return NULL;
1894
1895   if (iv.step == const0_rtx
1896       || iv.mode != iv.extend_mode)
1897     return NULL;
1898
1899   /* Record the insn to split.  */
1900   ivts = XNEW (struct iv_to_split);
1901   ivts->insn = insn;
1902   ivts->orig_var = dest;
1903   ivts->base_var = NULL_RTX;
1904   ivts->step = iv.step;
1905   ivts->next = NULL;
1906   ivts->n_loc = 1;
1907   ivts->loc[0] = 1;
1908
1909   return ivts;
1910 }
1911
1912 /* Determines which of insns in LOOP can be optimized.
1913    Return a OPT_INFO struct with the relevant hash tables filled
1914    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1915    is undefined for the return value.  */
1916
1917 static struct opt_info *
1918 analyze_insns_in_loop (struct loop *loop)
1919 {
1920   basic_block *body, bb;
1921   unsigned i;
1922   struct opt_info *opt_info = XCNEW (struct opt_info);
1923   rtx insn;
1924   struct iv_to_split *ivts = NULL;
1925   struct var_to_expand *ves = NULL;
1926   iv_to_split **slot1;
1927   var_to_expand **slot2;
1928   vec<edge> edges = get_loop_exit_edges (loop);
1929   edge exit;
1930   bool can_apply = false;
1931
1932   iv_analysis_loop_init (loop);
1933
1934   body = get_loop_body (loop);
1935
1936   if (flag_split_ivs_in_unroller)
1937     {
1938       opt_info->insns_to_split.create (5 * loop->num_nodes);
1939       opt_info->iv_to_split_head = NULL;
1940       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1941     }
1942
1943   /* Record the loop exit bb and loop preheader before the unrolling.  */
1944   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1945
1946   if (edges.length () == 1)
1947     {
1948       exit = edges[0];
1949       if (!(exit->flags & EDGE_COMPLEX))
1950         {
1951           opt_info->loop_exit = split_edge (exit);
1952           can_apply = true;
1953         }
1954     }
1955
1956   if (flag_variable_expansion_in_unroller
1957       && can_apply)
1958     {
1959       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1960       opt_info->var_to_expand_head = NULL;
1961       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1962     }
1963
1964   for (i = 0; i < loop->num_nodes; i++)
1965     {
1966       bb = body[i];
1967       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1968         continue;
1969
1970       FOR_BB_INSNS (bb, insn)
1971       {
1972         if (!INSN_P (insn))
1973           continue;
1974
1975         if (opt_info->insns_to_split.is_created ())
1976           ivts = analyze_iv_to_split_insn (insn);
1977
1978         if (ivts)
1979           {
1980             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
1981             gcc_assert (*slot1 == NULL);
1982             *slot1 = ivts;
1983             *opt_info->iv_to_split_tail = ivts;
1984             opt_info->iv_to_split_tail = &ivts->next;
1985             continue;
1986           }
1987
1988         if (opt_info->insns_with_var_to_expand.is_created ())
1989           ves = analyze_insn_to_expand_var (loop, insn);
1990
1991         if (ves)
1992           {
1993             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
1994             gcc_assert (*slot2 == NULL);
1995             *slot2 = ves;
1996             *opt_info->var_to_expand_tail = ves;
1997             opt_info->var_to_expand_tail = &ves->next;
1998           }
1999       }
2000     }
2001
2002   edges.release ();
2003   free (body);
2004   return opt_info;
2005 }
2006
2007 /* Called just before loop duplication.  Records start of duplicated area
2008    to OPT_INFO.  */
2009
2010 static void
2011 opt_info_start_duplication (struct opt_info *opt_info)
2012 {
2013   if (opt_info)
2014     opt_info->first_new_block = last_basic_block;
2015 }
2016
/* Return the number of iterations between the initialization of the base
   variable and the copy number N_COPY.  N_COPIES is the total number of
   newly created copies.  UNROLLING is true if we are unrolling (not
   peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization is done in the original loop body
     (number 0), so the distance is the copy number itself.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the copy in which the initialization occurs has
     number 1, and the original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
2041
2042 /* Locate in EXPR the expression corresponding to the location recorded
2043    in IVTS, and return a pointer to the RTX for this location.  */
2044
2045 static rtx *
2046 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2047 {
2048   unsigned i;
2049   rtx *ret = &expr;
2050
2051   for (i = 0; i < ivts->n_loc; i++)
2052     ret = &XEXP (*ret, ivts->loc[i]);
2053
2054   return ret;
2055 }
2056
2057 /* Allocate basic variable for the induction variable chain.  */
2058
2059 static void
2060 allocate_basic_variable (struct iv_to_split *ivts)
2061 {
2062   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2063
2064   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2065 }
2066
2067 /* Insert initialization of basic variable of IVTS before INSN, taking
2068    the initial value from INSN.  */
2069
2070 static void
2071 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2072 {
2073   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2074   rtx seq;
2075
2076   start_sequence ();
2077   expr = force_operand (expr, ivts->base_var);
2078   if (expr != ivts->base_var)
2079     emit_move_insn (ivts->base_var, expr);
2080   seq = get_insns ();
2081   end_sequence ();
2082
2083   emit_insn_before (seq, insn);
2084 }
2085
2086 /* Replace the use of induction variable described in IVTS in INSN
2087    by base variable + DELTA * step.  */
2088
2089 static void
2090 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2091 {
2092   rtx expr, *loc, seq, incr, var;
2093   enum machine_mode mode = GET_MODE (ivts->base_var);
2094   rtx src, dest, set;
2095
2096   /* Construct base + DELTA * step.  */
2097   if (!delta)
2098     expr = ivts->base_var;
2099   else
2100     {
2101       incr = simplify_gen_binary (MULT, mode,
2102                                   ivts->step, gen_int_mode (delta, mode));
2103       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2104                                   ivts->base_var, incr);
2105     }
2106
2107   /* Figure out where to do the replacement.  */
2108   loc = get_ivts_expr (single_set (insn), ivts);
2109
2110   /* If we can make the replacement right away, we're done.  */
2111   if (validate_change (insn, loc, expr, 0))
2112     return;
2113
2114   /* Otherwise, force EXPR into a register and try again.  */
2115   start_sequence ();
2116   var = gen_reg_rtx (mode);
2117   expr = force_operand (expr, var);
2118   if (expr != var)
2119     emit_move_insn (var, expr);
2120   seq = get_insns ();
2121   end_sequence ();
2122   emit_insn_before (seq, insn);
2123
2124   if (validate_change (insn, loc, var, 0))
2125     return;
2126
2127   /* The last chance.  Try recreating the assignment in insn
2128      completely from scratch.  */
2129   set = single_set (insn);
2130   gcc_assert (set);
2131
2132   start_sequence ();
2133   *loc = var;
2134   src = copy_rtx (SET_SRC (set));
2135   dest = copy_rtx (SET_DEST (set));
2136   src = force_operand (src, dest);
2137   if (src != dest)
2138     emit_move_insn (dest, src);
2139   seq = get_insns ();
2140   end_sequence ();
2141
2142   emit_insn_before (seq, insn);
2143   delete_insn (insn);
2144 }
2145
2146
2147 /* Return one expansion of the accumulator recorded in struct VE.  */
2148
2149 static rtx
2150 get_expansion (struct var_to_expand *ve)
2151 {
2152   rtx reg;
2153
2154   if (ve->reuse_expansion == 0)
2155     reg = ve->reg;
2156   else
2157     reg = ve->var_expansions[ve->reuse_expansion - 1];
2158
2159   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2160     ve->reuse_expansion = 0;
2161   else
2162     ve->reuse_expansion++;
2163
2164   return reg;
2165 }
2166
2167
2168 /* Given INSN replace the uses of the accumulator recorded in VE
2169    with a new register.  */
2170
2171 static void
2172 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2173 {
2174   rtx new_reg, set;
2175   bool really_new_expansion = false;
2176
2177   set = single_set (insn);
2178   gcc_assert (set);
2179
2180   /* Generate a new register only if the expansion limit has not been
2181      reached.  Else reuse an already existing expansion.  */
2182   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2183     {
2184       really_new_expansion = true;
2185       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2186     }
2187   else
2188     new_reg = get_expansion (ve);
2189
2190   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2191   if (apply_change_group ())
2192     if (really_new_expansion)
2193       {
2194         ve->var_expansions.safe_push (new_reg);
2195         ve->expansion_count++;
2196       }
2197 }
2198
2199 /* Initialize the variable expansions in loop preheader.  PLACE is the
2200    loop-preheader basic block where the initialization of the
2201    expansions should take place.  The expansions are initialized with
2202    (-0) when the operation is plus or minus to honor sign zero.  This
2203    way we can prevent cases where the sign of the final result is
2204    effected by the sign of the expansion.  Here is an example to
2205    demonstrate this:
2206
2207    for (i = 0 ; i < n; i++)
2208      sum += something;
2209
2210    ==>
2211
2212    sum += something
2213    ....
2214    i = i+1;
2215    sum1 += something
2216    ....
2217    i = i+1
2218    sum2 += something;
2219    ....
2220
2221    When SUM is initialized with -zero and SOMETHING is also -zero; the
2222    final result of sum should be -zero thus the expansions sum1 and sum2
2223    should be initialized with -zero as well (otherwise we will get +zero
2224    as the final result).  */
2225
2226 static void
2227 insert_var_expansion_initialization (struct var_to_expand *ve,
2228                                      basic_block place)
2229 {
2230   rtx seq, var, zero_init;
2231   unsigned i;
2232   enum machine_mode mode = GET_MODE (ve->reg);
2233   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2234
2235   if (ve->var_expansions.length () == 0)
2236     return;
2237
2238   start_sequence ();
2239   switch (ve->op)
2240     {
2241     case FMA:
2242       /* Note that we only accumulate FMA via the ADD operand.  */
2243     case PLUS:
2244     case MINUS:
2245       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2246         {
2247           if (honor_signed_zero_p)
2248             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2249           else
2250             zero_init = CONST0_RTX (mode);
2251           emit_move_insn (var, zero_init);
2252         }
2253       break;
2254
2255     case MULT:
2256       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2257         {
2258           zero_init = CONST1_RTX (GET_MODE (var));
2259           emit_move_insn (var, zero_init);
2260         }
2261       break;
2262
2263     default:
2264       gcc_unreachable ();
2265     }
2266
2267   seq = get_insns ();
2268   end_sequence ();
2269
2270   emit_insn_after (seq, BB_END (place));
2271 }
2272
2273 /* Combine the variable expansions at the loop exit.  PLACE is the
2274    loop exit basic block where the summation of the expansions should
2275    take place.  */
2276
2277 static void
2278 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2279 {
2280   rtx sum = ve->reg;
2281   rtx expr, seq, var, insn;
2282   unsigned i;
2283
2284   if (ve->var_expansions.length () == 0)
2285     return;
2286
2287   start_sequence ();
2288   switch (ve->op)
2289     {
2290     case FMA:
2291       /* Note that we only accumulate FMA via the ADD operand.  */
2292     case PLUS:
2293     case MINUS:
2294       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2295         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2296       break;
2297
2298     case MULT:
2299       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2300         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2301       break;
2302
2303     default:
2304       gcc_unreachable ();
2305     }
2306
2307   expr = force_operand (sum, ve->reg);
2308   if (expr != ve->reg)
2309     emit_move_insn (ve->reg, expr);
2310   seq = get_insns ();
2311   end_sequence ();
2312
2313   insn = BB_HEAD (place);
2314   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2315     insn = NEXT_INSN (insn);
2316
2317   emit_insn_after (seq, insn);
2318 }
2319
2320 /* Strip away REG_EQUAL notes for IVs we're splitting.
2321
2322    Updating REG_EQUAL notes for IVs we split is tricky: We
2323    cannot tell until after unrolling, DF-rescanning, and liveness
2324    updating, whether an EQ_USE is reached by the split IV while
2325    the IV reg is still live.  See PR55006.
2326
2327    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2328    because RTL loop-iv requires us to defer rescanning insns and
2329    any notes attached to them.  So resort to old techniques...  */
2330
2331 static void
2332 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2333 {
2334   struct iv_to_split *ivts;
2335   rtx note = find_reg_equal_equiv_note (insn);
2336   if (! note)
2337     return;
2338   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2339     if (reg_mentioned_p (ivts->orig_var, note))
2340       {
2341         remove_note (insn, note);
2342         return;
2343       }
2344 }
2345
2346 /* Apply loop optimizations in loop copies using the
2347    data which gathered during the unrolling.  Structure
2348    OPT_INFO record that data.
2349
2350    UNROLLING is true if we unrolled (not peeled) the loop.
2351    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2352    the loop (as it should happen in complete unrolling, but not in ordinary
2353    peeling of the loop).  */
2354
2355 static void
2356 apply_opt_in_copies (struct opt_info *opt_info,
2357                      unsigned n_copies, bool unrolling,
2358                      bool rewrite_original_loop)
2359 {
2360   unsigned i, delta;
2361   basic_block bb, orig_bb;
2362   rtx insn, orig_insn, next;
2363   struct iv_to_split ivts_templ, *ivts;
2364   struct var_to_expand ve_templ, *ves;
2365
2366   /* Sanity check -- we need to put initialization in the original loop
2367      body.  */
2368   gcc_assert (!unrolling || rewrite_original_loop);
2369
2370   /* Allocate the basic variables (i0).  */
2371   if (opt_info->insns_to_split.is_created ())
2372     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2373       allocate_basic_variable (ivts);
2374
2375   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2376     {
2377       bb = BASIC_BLOCK (i);
2378       orig_bb = get_bb_original (bb);
2379
2380       /* bb->aux holds position in copy sequence initialized by
2381          duplicate_loop_to_header_edge.  */
2382       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2383                                         unrolling);
2384       bb->aux = 0;
2385       orig_insn = BB_HEAD (orig_bb);
2386       FOR_BB_INSNS_SAFE (bb, insn, next)
2387         {
2388           if (!INSN_P (insn)
2389               || (DEBUG_INSN_P (insn)
2390                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2391             continue;
2392
2393           while (!INSN_P (orig_insn)
2394                  || (DEBUG_INSN_P (orig_insn)
2395                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2396                          == LABEL_DECL)))
2397             orig_insn = NEXT_INSN (orig_insn);
2398
2399           ivts_templ.insn = orig_insn;
2400           ve_templ.insn = orig_insn;
2401
2402           /* Apply splitting iv optimization.  */
2403           if (opt_info->insns_to_split.is_created ())
2404             {
2405               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2406
2407               ivts = opt_info->insns_to_split.find (&ivts_templ);
2408
2409               if (ivts)
2410                 {
2411                   gcc_assert (GET_CODE (PATTERN (insn))
2412                               == GET_CODE (PATTERN (orig_insn)));
2413
2414                   if (!delta)
2415                     insert_base_initialization (ivts, insn);
2416                   split_iv (ivts, insn, delta);
2417                 }
2418             }
2419           /* Apply variable expansion optimization.  */
2420           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2421             {
2422               ves = (struct var_to_expand *)
2423                 opt_info->insns_with_var_to_expand.find (&ve_templ);
2424               if (ves)
2425                 {
2426                   gcc_assert (GET_CODE (PATTERN (insn))
2427                               == GET_CODE (PATTERN (orig_insn)));
2428                   expand_var_during_unrolling (ves, insn);
2429                 }
2430             }
2431           orig_insn = NEXT_INSN (orig_insn);
2432         }
2433     }
2434
2435   if (!rewrite_original_loop)
2436     return;
2437
2438   /* Initialize the variable expansions in the loop preheader
2439      and take care of combining them at the loop exit.  */
2440   if (opt_info->insns_with_var_to_expand.is_created ())
2441     {
2442       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2443         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2444       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2445         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2446     }
2447
2448   /* Rewrite also the original loop body.  Find them as originals of the blocks
2449      in the last copied iteration, i.e. those that have
2450      get_bb_copy (get_bb_original (bb)) == bb.  */
2451   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2452     {
2453       bb = BASIC_BLOCK (i);
2454       orig_bb = get_bb_original (bb);
2455       if (get_bb_copy (orig_bb) != bb)
2456         continue;
2457
2458       delta = determine_split_iv_delta (0, n_copies, unrolling);
2459       for (orig_insn = BB_HEAD (orig_bb);
2460            orig_insn != NEXT_INSN (BB_END (bb));
2461            orig_insn = next)
2462         {
2463           next = NEXT_INSN (orig_insn);
2464
2465           if (!INSN_P (orig_insn))
2466             continue;
2467
2468           ivts_templ.insn = orig_insn;
2469           if (opt_info->insns_to_split.is_created ())
2470             {
2471               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2472
2473               ivts = (struct iv_to_split *)
2474                 opt_info->insns_to_split.find (&ivts_templ);
2475               if (ivts)
2476                 {
2477                   if (!delta)
2478                     insert_base_initialization (ivts, orig_insn);
2479                   split_iv (ivts, orig_insn, delta);
2480                   continue;
2481                 }
2482             }
2483
2484         }
2485     }
2486 }
2487
2488 /* Release OPT_INFO.  */
2489
2490 static void
2491 free_opt_info (struct opt_info *opt_info)
2492 {
2493   if (opt_info->insns_to_split.is_created ())
2494     opt_info->insns_to_split.dispose ();
2495   if (opt_info->insns_with_var_to_expand.is_created ())
2496     {
2497       struct var_to_expand *ves;
2498
2499       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2500         ves->var_expansions.release ();
2501       opt_info->insns_with_var_to_expand.dispose ();
2502     }
2503   free (opt_info);
2504 }