/* Loop unrolling and peeling.
   Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "hard-reg-set.h"
#include "obstack.h"
#include "basic-block.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "params.h"
#include "output.h"
#include "expr.h"
#include "hashtab.h"
#include "recog.h"
#include "plugin.h"

/* This pass performs loop unrolling and peeling.  We only perform these
   optimizations on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from greater sequentiality
   of code, better opportunities for further passes and, in some cases,
   from fewer tests of exit conditions.  The main problem is code growth,
   which impacts performance negatively due to cache effects.

   What we do:

   -- complete peeling of once-rolling loops; this is the above mentioned
      exception, as this causes the loop to be cancelled completely and
      does not cause code growth
   -- complete peeling of loops that roll a (small) constant number of times
   -- simple peeling of first iterations of loops that do not roll much
      (according to profile feedback)
   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of exit condition tests
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code

   For more detailed descriptions of each of those, see the comments at the
   appropriate function below.

   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll/peel.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of several
   percent.  */
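
/* To make the size limits concrete, here is the arithmetic that the
   decide_unroll_* routines below perform, with illustrative parameter
   values (chosen for the example, not the built-in defaults): given
   --param max-unrolled-insns=200, --param max-average-unrolled-insns=80
   and --param max-unroll-times=8, a loop body of 24 insns that executes
   20 insns in an average iteration starts from

     nunroll = MIN (200 / 24, 80 / 20, 8) = MIN (8, 4, 8) = 4,

   i.e. at most 4 copies of the body are considered for this loop.  */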

/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx insn;              /* The insn in which the induction variable occurs.  */
  rtx base_var;          /* The variable on which the values in the further
                            iterations are based.  */
  rtx step;              /* Step of the induction variable.  */
  struct iv_to_split *next;  /* Next entry in walking order.  */
  unsigned n_loc;
  unsigned loc[3];       /* Location where the definition of the induction
                            variable occurs in the insn.  For example if
                            N_LOC is 2, the expression is located at
                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx insn;                        /* The insn in which the variable expansion occurs.  */
  rtx reg;                         /* The accumulator which is expanded.  */
  VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;      /* Next entry in walking order.  */
  enum rtx_code op;                /* The type of the accumulation - addition, subtraction
                                      or multiplication.  */
  int expansion_count;             /* Count the number of expansions generated so far.  */
  int reuse_expansion;             /* The expansion we intend to reuse to expand
                                      the accumulator.  If REUSE_EXPANSION is 0 reuse
                                      the original accumulator.  Else use
                                      var_expansions[REUSE_EXPANSION - 1].  */
  unsigned accum_pos;              /* The position in which the accumulator is placed in
                                      the insn src.  For example in x = x + something
                                      accum_pos is 0 while in x = something + x accum_pos
                                      is 1.  */
};

/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  htab_t insns_to_split;           /* A hashtable of insns to split.  */
  struct iv_to_split *iv_to_split_head;   /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail;  /* Pointer to the tail of the list.  */
  htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
                                      to expand.  */
  struct var_to_expand *var_to_expand_head;   /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail;  /* Pointer to the tail of the list.  */
  unsigned first_new_block;        /* The first basic block that was
                                      duplicated.  */
  basic_block loop_exit;           /* The loop exit basic block.  */
  basic_block loop_preheader;      /* The loop preheader basic block.  */
};

static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);
static void decide_peel_simple (struct loop *, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx);
static struct iv_to_split *analyze_iv_to_split_insn (rtx);
static void expand_var_during_unrolling (struct var_to_expand *, rtx);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);

/* Unroll and/or peel (depending on FLAGS) LOOPS.  */
void
unroll_and_peel_loops (int flags)
{
  struct loop *loop;
  bool check;
  loop_iterator li;

  /* First perform complete loop peeling (it is almost surely a win,
     and affects parameters for further decision a lot).  */
  peel_loops_completely (flags);

  /* Now decide the rest of the unrolling and peeling.  */
  decide_unrolling_and_peeling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      check = true;
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_PEEL_COMPLETELY:
	  /* Already done.  */
	  gcc_unreachable ();
	case LPT_PEEL_SIMPLE:
	  peel_loop_simple (loop);
	  break;
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  break;
	case LPT_NONE:
	  check = false;
	  break;
	default:
	  gcc_unreachable ();
	}
      if (check)
	{
#ifdef ENABLE_CHECKING
	  verify_dominators (CDI_DOMINATORS);
	  verify_loop_structure ();
#endif
	}
    }

  iv_analysis_done ();
}

/* Check whether exit of the LOOP is at the end of loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx insn;

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn))
	return false;
    }

  return true;
}

/* Depending on FLAGS, check whether to peel loops completely and do so.  */
static void
peel_loops_completely (int flags)
{
  struct loop *loop;
  loop_iterator li;

  /* Scan the loops, the inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;

      if (dump_file)
	fprintf (dump_file,
		 "\n;; *** Considering loop %d for complete peeling ***\n",
		 loop->num);

      loop->ninsns = num_loop_insns (loop);

      decide_peel_once_rolling (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_peel_completely (loop, flags);

      if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
	{
	  peel_loop_completely (loop);
#ifdef ENABLE_CHECKING
	  verify_dominators (CDI_DOMINATORS);
	  verify_loop_structure ();
#endif
	}
    }
}

/* Decide whether to unroll or peel loops (depending on FLAGS) and how much.  */
static void
decide_unrolling_and_peeling (int flags)
{
  struct loop *loop;
  loop_iterator li;

  /* ICI: parameter holders.  */

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;

      if (dump_file)
	fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  static bool ici_optimize_loop_for_size_p;
	  ici_optimize_loop_for_size_p = 1;
	  invoke_plugin_va_callbacks
	    (PLUGIN_UNROLL_PARAMETER_HANDLER,
	     "_loop.id", EP_INT, &(loop->num),
	     "loop.optimize_loop_for_size_p", EP_UNSIGNED_CHAR,
	     &ici_optimize_loop_for_size_p, NULL);

	  if (ici_optimize_loop_for_size_p)
	    {
	      if (dump_file)
		fprintf (dump_file, ";; Not considering loop, cold area\n");
	      continue;
	    }
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  static bool ici_can_duplicate_loop_p;
	  ici_can_duplicate_loop_p = 0;
	  invoke_plugin_va_callbacks
	    (PLUGIN_UNROLL_PARAMETER_HANDLER,
	     "_loop.id", EP_INT, &(loop->num),
	     "_loop.can_duplicate_loop_p", EP_UNSIGNED_CHAR,
	     &ici_can_duplicate_loop_p, NULL);

	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  static bool ici_is_inner_most;
	  ici_is_inner_most = 0;
	  invoke_plugin_va_callbacks
	    (PLUGIN_UNROLL_PARAMETER_HANDLER,
	     "_loop.id", EP_INT, &(loop->num),
	     "_loop.is_inner_most", EP_UNSIGNED_CHAR, &ici_is_inner_most,
	     NULL);

	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
	 priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_peel_simple (loop, flags);

      invoke_plugin_va_callbacks
	(PLUGIN_UNROLL_PARAMETER_HANDLER,
	 "_loop.id", EP_INT, &(loop->num),
	 /* ICI: Number of loop insns.  */
	 "_loop.ninsns", EP_UNSIGNED, &(loop->ninsns),
	 /* ICI: Average number of executed insns per iteration.  */
	 "_loop.av_ninsns", EP_UNSIGNED, &(loop->av_ninsns),
	 /* ICI: Number of blocks contained within the loop.  */
	 "_loop.num_nodes", EP_UNSIGNED, &(loop->num_nodes),
	 /* ICI: Loop unrolling/peeling decision.  */
	 "loop.lpt_decision.times", EP_UNSIGNED, &(loop->lpt_decision.times),
	 "loop.lpt_decision.decision", EP_INT, /* Enum treated as int.  */
	 &(loop->lpt_decision.decision), NULL);
    }
}
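
/* ICI: each invoke_plugin_va_callbacks call above passes its parameters
   as ("name", EP_type, pointer) triples terminated by a NULL name.  A
   plugin may write back through these pointers; in particular the values
   behind "loop.lpt_decision.times" and "loop.lpt_decision.decision" are
   read again by unroll_and_peel_loops, so a plugin can override the
   heuristic decision, and "loop.optimize_loop_for_size_p" lets it
   override the cold-area check.  The fragment below, excluded from
   compilation, is a minimal sketch of how a plugin-side handler might
   walk such an argument list.  The va_list-based handler signature and
   the registration (not shown) are assumptions made for illustration;
   only the triple convention and the parameter names come from the
   calls above.  */
#if 0
static void
example_unroll_parameter_handler (va_list ap)
{
  const char *name;

  while ((name = va_arg (ap, const char *)) != NULL)
    {
      int type = va_arg (ap, int);      /* EP_INT, EP_UNSIGNED, ...  */
      void *value = va_arg (ap, void *);

      /* Hypothetical policy: request four copies of the loop body.  A
	 real plugin would also inspect "loop.lpt_decision.decision"
	 before changing the unroll count.  */
      if (type == EP_UNSIGNED
	  && strcmp (name, "loop.lpt_decision.times") == 0)
	*(unsigned *) value = 3;        /* TIMES + 1 == 4 copies.  */
    }
}
#endif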

/* Decide whether the LOOP is once rolling and suitable for complete
   peeling.  */
static void
decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");

  /* Is the loop small enough?  */
  if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || desc->infinite
      || !desc->const_iter
      || desc->niter != 0)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop rolls exactly once\n");
      return;
    }

  /* Success.  */
  if (dump_file)
    fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}

/* Decide whether the LOOP is suitable for complete peeling.  */
static void
decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  unsigned npeel;
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling completely\n");

  /* Skip non-innermost loops.  */
  if (loop->inner)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is not innermost\n");
      return;
    }

  /* Do not peel cold areas.  */
  if (optimize_loop_for_size_p (loop))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, cold area\n");
      return;
    }

  /* Can the loop be manipulated?  */
  if (!can_duplicate_loop_p (loop))
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Not considering loop, cannot duplicate\n");
      return;
    }

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);

  /* Is the loop small enough?  */
  if (!npeel)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || !desc->const_iter
      || desc->infinite)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  if (desc->niter > npeel - 1)
    {
      if (dump_file)
	{
	  fprintf (dump_file,
		   ";; Not peeling loop completely, rolls too much (");
	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
	  fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
	}
      return;
    }

  /* Success.  */
  if (dump_file)
    fprintf (dump_file, ";; Decided to peel loop completely\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}

/* Peel all iterations of LOOP, remove exit edges and cancel the loop
   completely.  The transformation done:

   for (i = 0; i < 4; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   body; i++;
   body; i++;
   */
static void
peel_loop_completely (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned HOST_WIDE_INT npeel;
  unsigned i;
  VEC (edge, heap) *remove_edges;
  edge ein;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;

  npeel = desc->niter;

  if (npeel)
    {
      bool ok;

      wont_exit = sbitmap_alloc (npeel + 1);
      sbitmap_ones (wont_exit);
      RESET_BIT (wont_exit, 0);
      if (desc->noloop_assumptions)
	RESET_BIT (wont_exit, 1);

      remove_edges = NULL;

      if (flag_split_ivs_in_unroller)
	opt_info = analyze_insns_in_loop (loop);

      opt_info_start_duplication (opt_info);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  npeel,
					  wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ
					  | DLTHE_FLAG_COMPLETTE_PEEL
					  | (opt_info
					     ? DLTHE_RECORD_COPY_NUMBER : 0));
      gcc_assert (ok);

      free (wont_exit);

      if (opt_info)
	{
	  apply_opt_in_copies (opt_info, npeel, false, true);
	  free_opt_info (opt_info);
	}

      /* Remove the exit edges.  */
      for (i = 0; VEC_iterate (edge, remove_edges, i, ein); i++)
	remove_path (ein);
      VEC_free (edge, heap, remove_edges);
    }

  ein = desc->in_edge;
  free_simple_loop_desc (loop);

  /* Now remove the unreachable part of the last iteration and cancel
     the loop.  */
  remove_path (ein);

  if (dump_file)
    fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
}

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with constant "
	     "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.  */
  if (desc->niter < 2 * nunroll)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  if (dump_file)
    fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
	     best_unroll + 1, best_copies, nunroll);

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;

  if (dump_file)
    fprintf (dump_file,
	     ";; Decided to unroll the constant times rolling loop, %d times.\n",
	     loop->lpt_decision.times);
}

/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
   times.  The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  VEC (edge, heap) *remove_edges;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  sbitmap_ones (wont_exit);

  remove_edges = NULL;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave exit test
	 in the first copy, so that the loops that start with test
	 of exit condition have continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition on beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      RESET_BIT (wont_exit, 0);
      if (desc->noloop_assumptions)
	RESET_BIT (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  desc->niter_max -= exit_mod;
	}

      SET_BIT (wont_exit, 1);
    }
  else
    {
      /* Leave exit test in last copy, for the same reason as above if
	 the loop tests the condition at the end of loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition on end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care of
	 case when we would exit before reaching the loop.  So just peel
	 exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  RESET_BIT (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    RESET_BIT (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  desc->niter_max -= exit_mod + 1;
	  desc->noloop_assumptions = NULL_RTX;

	  SET_BIT (wont_exit, 0);
	  SET_BIT (wont_exit, 1);
	}

      RESET_BIT (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  desc->niter_max /= max_unroll + 1;
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  for (i = 0; VEC_iterate (edge, remove_edges, i, e); i++)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP iterating a runtime-computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with runtime "
	     "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* If we have profile feedback, check whether the loop rolls.  */
  if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;
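
  /* For instance, nunroll == 6 ends the loop above with i == 4, the
     largest power of two not exceeding nunroll, i.e. an unroll factor
     of 4 (lpt_decision.times == 3).  Illustrative values only.  */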

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;

  if (dump_file)
    fprintf (dump_file,
	     ";; Decided to unroll the runtime computable "
	     "times rolling loop, %d times.\n",
	     loop->lpt_decision.times);
}

/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info call in loop_optimizer_finalize would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, init_code, branch_code, tmp;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  VEC (basic_block, heap) *dom_bbs;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  VEC (edge, heap) *remove_edges;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  dom_bbs = NULL;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      VEC (basic_block, heap) *ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      for (j = 0; VEC_iterate (basic_block, ldom, j, bb); j++)
	if (!flow_bb_inside_loop_p (loop, bb))
	  VEC_safe_push (basic_block, heap, dom_bbs, bb);

      VEC_free (basic_block, heap, ldom);
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter,
			       GEN_INT (max_unroll),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  remove_edges = NULL;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of loop body (almost always we must leave exit test
     here; the only exception is when we have extra zero check and the number
     of iterations is reliable).  Also record the place of (possible) extra
     zero check.  */
  sbitmap_zero (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    SET_BIT (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
				      1, wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      sbitmap_zero (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	SET_BIT (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
					  block_label (preheader), p,
					  NULL_RTX);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL_RTX);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */

  sbitmap_ones (wont_exit);
  RESET_BIT (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  for (i = 0; VEC_iterate (edge, remove_edges, i, e); i++)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 GEN_INT (max_unroll + 1));
  desc->niter_max /= max_unroll + 1;
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      desc->niter_max--;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));

  VEC_free (basic_block, heap, dom_bbs);
}

/* Decide whether to simply peel LOOP and how much.  */
static void
decide_peel_simple (struct loop *loop, int flags)
{
  unsigned npeel;
  struct niter_desc *desc;

  if (!(flags & UAP_PEEL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering simply peeling loop\n");

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);

  /* Skip big loops.  */
  if (!npeel)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (desc->simple_p && !desc->assumptions && desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Do not simply peel loops with branches inside -- it increases the
     number of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not peeling, contains branches\n");
      return;
    }

  if (loop->header->count)
    {
      unsigned niter = expected_loop_iterations (loop);
      if (niter + 1 > npeel)
	{
	  if (dump_file)
	    {
	      fprintf (dump_file, ";; Not peeling loop, rolls too much (");
	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
		       (HOST_WIDEST_INT) (niter + 1));
	      fprintf (dump_file, " iterations > %d [maximum peelings])\n",
		       npeel);
	    }
	  return;
	}
      npeel = niter + 1;
    }
  else
    {
      /* For now we have no good heuristics to decide whether loop peeling
	 will be effective, so disable it.  */
      if (dump_file)
	fprintf (dump_file,
		 ";; Not peeling loop, no evidence it will be profitable\n");
      return;
    }

  /* Success.  */
  loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
  loop->lpt_decision.times = npeel;

  if (dump_file)
    fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
	     loop->lpt_decision.times);
}

/* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
   while (cond)
     body;

   ==>

   if (!cond) goto end;
   body;
   if (!cond) goto end;
   body;
   while (cond)
     body;
   end: ;
   */
static void
peel_loop_simple (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned npeel = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller && npeel > 1)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (npeel + 1);
  sbitmap_zero (wont_exit);

  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
				      npeel, wont_exit, NULL,
				      NULL, DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  free (wont_exit);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, npeel, false, false);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      if (desc->const_iter)
	{
	  desc->niter -= npeel;
	  desc->niter_expr = GEN_INT (desc->niter);
	  desc->noloop_assumptions = NULL_RTX;
	}
      else
	{
	  /* We cannot just update niter_expr, as its value might be clobbered
	     inside loop.  We could handle this by counting the number into
	     temporary just like we do in runtime unrolling, but it does not
	     seem worthwhile.  */
	  free_simple_loop_desc (loop);
	}
    }

  if (dump_file)
    fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
}

/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the
     number of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* If we have profile feedback, check whether the loop rolls.  */
  if (loop->header->count
      && expected_loop_iterations (loop) < 2 * nunroll)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;

  if (dump_file)
    fprintf (dump_file,
	     ";; Decided to unroll the loop stupidly, %d times.\n",
	     loop->lpt_decision.times);
}

/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
   while (cond)
     body;

   ==>

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  sbitmap_zero (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
	 for a loop to be really simple.  We could update the counts, but the
	 problem is that we are unable to decide which exit will be taken
	 (not really true in case the number of iterations is constant,
	 but no one will do anything with this information, so we do not
	 worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}

/* A hash function for information about insns to split.  */

static hashval_t
si_info_hash (const void *ivts)
{
  return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
}

/* An equality function for information about insns to split.  */

static int
si_info_eq (const void *ivts1, const void *ivts2)
{
  const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
  const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;

  return i1->insn == i2->insn;
}

/* Return a hash for VES, which is really a "var_to_expand *".  */

static hashval_t
ve_info_hash (const void *ves)
{
  return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
}

/* Return true if IVTS1 and IVTS2 (which are really both of type
   "var_to_expand *") refer to the same instruction.  */

static int
ve_info_eq (const void *ivts1, const void *ivts2)
{
  const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
  const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;

  return i1->insn == i2->insn;
}

/* Returns true if REG is referenced in exactly one insn in LOOP.  */

bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (rtx_referenced_p (reg, insn))
	  count_ref++;
    }
  return (count_ref == 1);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it with
   the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx insn)
{
  rtx set, dest, src, op1, op2, something;
  struct var_to_expand *ves;
  enum machine_mode mode1, mode2;
  unsigned accum_pos;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);

  if (GET_CODE (src) != PLUS
      && GET_CODE (src) != MINUS
      && GET_CODE (src) != MULT)
    return NULL;

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for, which looks for an optab for the operation
     in SRC.  If it doesn't exist, we cannot perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (GET_CODE (src), GET_MODE (src)))
    return NULL;

  op1 = XEXP (src, 0);
  op2 = XEXP (src, 1);

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  if (rtx_equal_p (dest, op1))
    accum_pos = 0;
  else if (rtx_equal_p (dest, op2))
    accum_pos = 1;
  else
    return NULL;

  /* The method of expansion that we are using (initializing the expansions
     with zero and summing them at the end of the computation) would yield
     wrong results for (x = something - x); avoid using it in that case.  */
  if (accum_pos == 1
      && GET_CODE (src) == MINUS)
    return NULL;

  something = (accum_pos == 0) ? op2 : op1;

  if (!referenced_in_one_insn_in_loop_p (loop, dest))
    return NULL;

  if (rtx_referenced_p (dest, something))
    return NULL;

  mode1 = GET_MODE (dest);
  mode2 = GET_MODE (something);
  if ((FLOAT_MODE_P (mode1)
       || FLOAT_MODE_P (mode2))
      && !flag_associative_math)
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file,
	       "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions = VEC_alloc (rtx, heap, 1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  ves->accum_pos = accum_pos;
  return ves;
}

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;
  ivts->n_loc = 1;
  ivts->loc[0] = 1;

  return ivts;
}

/* Determine which insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  PTR *slot1;
  PTR *slot2;
  VEC (edge, heap) *edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
					      si_info_hash, si_info_eq, free);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (VEC_length (edge, edges) == 1)
    {
      exit = VEC_index (edge, edges, 0);
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
							ve_info_hash,
							ve_info_eq, free);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  VEC_free (edge, heap, edges);
  free (body);
  return opt_info;
}

/* Called just before loop duplication.  Records start of duplicated area
   to OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block;
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}

/* Locate in EXPR the expression corresponding to the location recorded
   in IVTS, and return a pointer to the RTX for this location.  */

static rtx *
get_ivts_expr (rtx expr, struct iv_to_split *ivts)
{
  unsigned i;
  rtx *ret = &expr;

  for (i = 0; i < ivts->n_loc; i++)
    ret = &XEXP (*ret, ivts->loc[i]);

  return ret;
}

/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx insn)
{
  rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
  rtx seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}

/* Replace the use of induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
{
  rtx expr, *loc, seq, incr, var;
  enum machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = get_ivts_expr (single_set (insn), ivts);

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}

/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);

  if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
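
/* Illustrative reuse order (the state below is hypothetical): with two
   expansions {sum1, sum2} in VE->var_expansions, successive calls
   return VE->reg, sum1, sum2, VE->reg, sum1, ... -- the accumulators
   are handed out round-robin once the limit on new expansions has been
   reached.  */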

/* Given INSN, replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_change (insn, &SET_DEST (set), new_reg, 1);
  validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);

  if (apply_change_group ())
    if (really_new_expansion)
      {
        VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
        ve->expansion_count++;
      }
}
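
/* Illustrative rewrite (pseudo source, hypothetical names): an
   accumulation "sum = sum + x" in a copy becomes "sum1 = sum1 + x".
   Both the SET_DEST and the accumulator use at VE->accum_pos are
   redirected to the same new register, so each copy accumulates into a
   private register that is folded back at the loop exit.  */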

/* Initialize the variable expansions in the loop preheader.  PLACE is
   the loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

     for (i = 0 ; i < n; i++)
       sum += something;

   ==>

     sum += something
     ....
     i = i+1;
     sum1 += something
     ....
     i = i+1
     sum2 += something;
     ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero; thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
                                     basic_block place)
{
  rtx seq, var, zero_init, insn;
  unsigned i;
  enum machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (VEC_length (rtx, ve->var_expansions) == 0)
    return;

  start_sequence ();
  if (ve->op == PLUS || ve->op == MINUS)
    for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
      {
        if (honor_signed_zero_p)
          zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
        else
          zero_init = CONST0_RTX (mode);

        emit_move_insn (var, zero_init);
      }
  else if (ve->op == MULT)
    for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
      {
        zero_init = CONST1_RTX (GET_MODE (var));
        emit_move_insn (var, zero_init);
      }

  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
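
/* Sketch of what the emitted preheader code amounts to at source level
   (hypothetical names, illustrative only):

     sum1 = -0.0;  sum2 = -0.0;    for PLUS/MINUS when -0.0 matters,
     sum1 = 0;     sum2 = 0;       for PLUS/MINUS otherwise,
     sum1 = 1;     sum2 = 1;       for MULT.

   IEEE addition gives -0.0 + -0.0 == -0.0 but +0.0 + -0.0 == +0.0, so
   a +0 initializer could flip the sign of an all-negative-zero sum.  */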

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop-exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, seq, var, insn;
  unsigned i;

  if (VEC_length (rtx, ve->var_expansions) == 0)
    return;

  start_sequence ();
  if (ve->op == PLUS || ve->op == MINUS)
    for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
      sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg),
                                 var, sum);
  else if (ve->op == MULT)
    for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
      sum = simplify_gen_binary (MULT, GET_MODE (ve->reg),
                                 var, sum);

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
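
/* Sketch of the exit code this emits (pseudo source, hypothetical
   names): for PLUS/MINUS, "sum = sum2 + (sum1 + sum)"; for MULT,
   "prod = prod2 * (prod1 * prod)".  Each recorded expansion is folded
   back into VE->reg, so the original accumulator again holds the full
   result after the loop.  */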

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body
   of the loop (as it should happen in complete unrolling, but not in
   ordinary peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
                     unsigned n_copies, bool unrolling,
                     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx insn, orig_insn, next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
         duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t) bb->aux, n_copies,
                                        unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
        {
          next = NEXT_INSN (insn);
          if (!INSN_P (insn))
            continue;

          while (!INSN_P (orig_insn))
            orig_insn = NEXT_INSN (orig_insn);

          ivts_templ.insn = orig_insn;
          ve_templ.insn = orig_insn;

          /* Apply splitting iv optimization.  */
          if (opt_info->insns_to_split)
            {
              ivts = (struct iv_to_split *)
                htab_find (opt_info->insns_to_split, &ivts_templ);

              if (ivts)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));

                  if (!delta)
                    insert_base_initialization (ivts, insn);
                  split_iv (ivts, insn, delta);
                }
            }
          /* Apply variable expansion optimization.  */
          if (unrolling && opt_info->insns_with_var_to_expand)
            {
              ves = (struct var_to_expand *)
                htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
              if (ves)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));
                  expand_var_during_unrolling (ves, insn);
                }
            }
          orig_insn = NEXT_INSN (orig_insn);
        }
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find the original blocks as the
     originals of the blocks in the last copied iteration, i.e. those that
     have get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
        continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
           orig_insn != NEXT_INSN (BB_END (bb));
           orig_insn = next)
        {
          next = NEXT_INSN (orig_insn);

          if (!INSN_P (orig_insn))
            continue;

          ivts_templ.insn = orig_insn;
          if (opt_info->insns_to_split)
            {
              ivts = (struct iv_to_split *)
                htab_find (opt_info->insns_to_split, &ivts_templ);
              if (ivts)
                {
                  if (!delta)
                    insert_base_initialization (ivts, orig_insn);
                  split_iv (ivts, orig_insn, delta);
                  continue;
                }
            }
        }
    }
}
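
/* End-to-end sketch of what the two optimizations achieve for a 2x
   unroll (pseudo source, hypothetical names; the real transformation
   works on RTL):

     for (i = 0; i < n; i++)         i0 = i;               (base init)
       sum += a[i];            ==>   sum1 = 0;             (preheader)
                                     loop:
                                       sum  += a[i0];
                                       sum1 += a[i0 + 1];
                                       i0 += 2;
                                     sum += sum1;          (loop exit)

   Splitting the iv removes the serial i -> i+1 chain between the copies
   and expanding the accumulator removes the serial sum -> sum chain.  */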

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  if (opt_info->insns_to_split)
    htab_delete (opt_info->insns_to_split);
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        VEC_free (rtx, heap, ves->var_expansions);
      htab_delete (opt_info->insns_with_var_to_expand);
    }
  free (opt_info);
}