gcc/gimple-loop-versioning.cc

   1 /* Loop versioning pass.
   2    Copyright (C) 2018-2020 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "backend.h"
  24 #include "tree.h"
  25 #include "gimple.h"
  26 #include "gimple-iterator.h"
  27 #include "tree-pass.h"
  28 #include "gimplify-me.h"
  29 #include "cfgloop.h"
  30 #include "tree-ssa-loop.h"
  31 #include "ssa.h"
  32 #include "tree-scalar-evolution.h"
  33 #include "tree-chrec.h"
  34 #include "tree-ssa-loop-ivopts.h"
  35 #include "fold-const.h"
  36 #include "tree-ssa-propagate.h"
  37 #include "tree-inline.h"
  38 #include "domwalk.h"
  39 #include "alloc-pool.h"
  40 #include "vr-values.h"
  41 #include "gimple-ssa-evrp-analyze.h"
  42 #include "tree-vectorizer.h"
  43 #include "omp-general.h"
  44 #include "predict.h"
  45 #include "tree-into-ssa.h"
  46
  47 namespace {
  48
  49 /* This pass looks for loops that could be simplified if certain loop
  50    invariant conditions were true.  It is effectively a form of loop
  51    splitting in which the pass produces the split conditions itself,
  52    instead of using ones that are already present in the IL.
  53
  54    Versioning for when strides are 1
  55    ---------------------------------
  56
   At the moment the only things the pass looks for are memory references
   like:
  59
  60      for (auto i : ...)
  61        ...x[i * stride]...
  62
  63    It considers changing such loops to:
  64
  65      if (stride == 1)
  66        for (auto i : ...)    [A]
  67          ...x[i]...
  68      else
  69        for (auto i : ...)    [B]
  70          ...x[i * stride]...
  71
  72    This can have several benefits:
  73
  74    (1) [A] is often easier or cheaper to vectorize than [B].
  75
  76    (2) The scalar code in [A] is simpler than the scalar code in [B]
  77        (if the loops cannot be vectorized or need an epilogue loop).
  78
  79    (3) We might recognize [A] as a pattern, such as a memcpy or memset.
  80
  81    (4) [A] has simpler address evolutions, which can help other passes
  82        like loop interchange.
  83
  84    The optimization is particularly useful for assumed-shape arrays in
  85    Fortran, where the stride of the innermost dimension depends on the
  86    array descriptor but is often equal to 1 in practice.  For example:
  87
  88      subroutine f1(x)
  89        real :: x(:)
  90        x(:) = 100
  91      end subroutine f1
  92
  93    generates the equivalent of:
  94
  95      raw_stride = *x.dim[0].stride;
  96      stride = raw_stride != 0 ? raw_stride : 1;
  97      x_base = *x.data;
  98      ...
  99      tmp1 = stride * S;
 100      tmp2 = tmp1 - stride;
 101      *x_base[tmp2] = 1.0e+2;
 102
 103    but in the common case that stride == 1, the last three statements
 104    simplify to:
 105
 106      tmp3 = S + -1;
 107      *x_base[tmp3] = 1.0e+2;
 108
 109    The optimization is in principle very simple.  The difficult parts are:
 110
 111    (a) deciding which parts of a general address calculation correspond
 112        to the inner dimension of an array, since this usually isn't explicit
 113        in the IL, and for C often isn't even explicit in the source code
 114
 115    (b) estimating when the transformation is worthwhile
 116
 117    Structure
 118    ---------
 119
 120    The pass has four phases:
 121
 122    (1) Walk through the statements looking for and recording potential
 123        versioning opportunities.  Stop if there are none.
 124
 125    (2) Use context-sensitive range information to see whether any versioning
 126        conditions are impossible in practice.  Remove them if so, and stop
 127        if no opportunities remain.
 128
 129        (We do this only after (1) to keep compile time down when no
 130        versioning opportunities exist.)
 131
 132    (3) Apply the cost model.  Decide which versioning opportunities are
 133        worthwhile and at which nesting level they should be applied.
 134
 135    (4) Attempt to version all the loops selected by (3), so that:
 136
 137          for (...)
 138            ...
 139
 140        becomes:
 141
 142          if (!cond)
 143            for (...) // Original loop
 144              ...
 145          else
 146            for (...) // New loop
 147              ...
 148
 149        Use the version condition COND to simplify the new loop.  */
 150
/* Enumerates the likelihood that a particular value indexes the inner
   dimension of an array.  */
enum inner_likelihood {
  /* There is evidence that the value is not for the inner dimension.  */
  INNER_UNLIKELY,
  /* There is no evidence either way.  */
  INNER_DONT_KNOW,
  /* There is evidence that the value is for the inner dimension.  */
  INNER_LIKELY
};
 158
/* Information about one term of an address_info.  An address is
   described as a sum of such terms (see address_info::terms).  */
struct address_term_info
{
  /* The value of the term is EXPR * MULTIPLIER.  */
  tree expr;
  unsigned HOST_WIDE_INT multiplier;

  /* The stride applied by EXPR in each iteration of some unrecorded loop,
     or null if no stride has been identified.  */
  tree stride;

  /* Enumerates the likelihood that EXPR indexes the inner dimension
     of an array.  */
  enum inner_likelihood inner_likelihood;

  /* True if STRIDE == 1 is a versioning opportunity when considered
     in isolation.  */
  bool versioning_opportunity_p;
};
 178
/* Information about an address calculation, and the range of constant
   offsets applied to it.  */
class address_info
{
public:
  /* The number of terms for which TERMS has inline storage.  */
  static const unsigned int MAX_TERMS = 8;

  /* One statement that calculates the address.  If multiple statements
     share the same address, we only record the first.  */
  gimple *stmt;

  /* The loop containing STMT (cached for convenience).  If multiple
     statements share the same address, they all belong to this loop.  */
  class loop *loop;

  /* A decomposition of the calculation into a sum of terms plus an
     optional base.  When BASE is provided, it is never an SSA name.
     Once initialization is complete, all members of TERMS are SSA names.  */
  tree base;
  auto_vec<address_term_info, MAX_TERMS> terms;

  /* All bytes accessed from the address fall in the offset range
     [MIN_OFFSET, MAX_OFFSET), i.e. MAX_OFFSET itself is excluded.  */
  HOST_WIDE_INT min_offset, max_offset;
};
 204
/* Stores addresses based on their base and terms (ignoring the offsets).
   Two address_infos that differ only in their statement or offset range
   therefore map to the same table entry.  */
struct address_info_hasher : nofree_ptr_hash <address_info>
{
  /* Hash an address_info.  */
  static hashval_t hash (const address_info *);
  /* Return true if two address_infos have equal bases and terms.  */
  static bool equal (const address_info *, const address_info *);
};
 211
/* Information about the versioning we'd like to apply to a loop.  */
class loop_info
{
public:
  /* Return true if the loop has not been rejected and if there is a
     specific reason to version it.  */
  bool worth_versioning_p () const;

  /* True if we've decided not to version this loop.  The remaining
     fields are meaningless if so.  */
  bool rejected_p;

  /* True if at least one subloop of this loop benefits from versioning.  */
  bool subloops_benefit_p;

  /* An estimate of the total number of instructions in the loop,
     excluding those in subloops that benefit from versioning.  */
  unsigned int num_insns;

  /* The outermost loop that can handle all the version checks
     described below.  */
  class loop *outermost;

  /* The first entry in the list of blocks that belong to this loop
     (and not to subloops).  m_next_block_in_loop provides the chain
     pointers for the list.  */
  basic_block block_list;

  /* We'd like to version the loop for the case in which these SSA names
     (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime.  */
  bitmap_head unity_names;

  /* If versioning succeeds, this points to the version of the loop that
     assumes the version conditions hold.  */
  class loop *optimized_loop;
};
 246
/* The main pass structure.  An instance optimizes a single function,
   which is passed to the constructor; run is the only other public
   operation.  */
class loop_versioning
{
public:
  loop_versioning (function *);
  ~loop_versioning ();
  unsigned int run ();

private:
  /* Used to walk the dominator tree to find loop versioning conditions
     that are always false.  */
  class lv_dom_walker : public dom_walker
  {
  public:
    lv_dom_walker (loop_versioning &);

    edge before_dom_children (basic_block) FINAL OVERRIDE;
    void after_dom_children (basic_block) FINAL OVERRIDE;

  private:
    /* The parent pass.  */
    loop_versioning &m_lv;

    /* Used to build context-dependent range information.  */
    evrp_range_analyzer m_range_analyzer;
  };

  /* Used to simplify statements based on conditions that are established
     by the version checks.  */
  class name_prop : public substitute_and_fold_engine
  {
  public:
    name_prop (loop_info &li) : m_li (li) {}
    tree get_value (tree, gimple *) FINAL OVERRIDE;

  private:
    /* Information about the versioning we've performed on the loop.  */
    loop_info &m_li;
  };

  /* Return the loop_info for LOOP, which is indexed by LOOP's number.  */
  loop_info &get_loop_info (class loop *loop) { return m_loops[loop->num]; }

  /* Cost-model helpers.  */
  unsigned int max_insns_for_loop (class loop *);
  bool expensive_stmt_p (gimple *);

  /* Routines for finding and recording versioning opportunities.  */
  void version_for_unity (gimple *, tree);
  bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
                                unsigned HOST_WIDE_INT * = 0);
  bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
  bool multiply_term_by (address_term_info &, tree);
  inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
  void dump_inner_likelihood (address_info &, address_term_info &);
  void analyze_stride (address_info &, address_term_info &,
                       tree, class loop *);
  bool find_per_loop_multiplication (address_info &, address_term_info &);
  bool analyze_term_using_scevs (address_info &, address_term_info &);
  void analyze_arbitrary_term (address_info &, address_term_info &);
  void analyze_address_fragment (address_info &);
  void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
                                tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
  void analyze_expr (gimple *, tree);
  bool analyze_block (basic_block);
  bool analyze_blocks ();

  /* Routines for pruning version conditions using range information.  */
  void prune_loop_conditions (class loop *, vr_values *);
  bool prune_conditions ();

  /* Routines for deciding which loops to version.  */
  void merge_loop_info (class loop *, class loop *);
  void add_loop_to_queue (class loop *);
  bool decide_whether_loop_is_versionable (class loop *);
  bool make_versioning_decisions ();

  /* Routines for carrying out the versioning.  */
  bool version_loop (class loop *);
  void implement_versioning_decisions ();

  /* The function we're optimizing.  */
  function *m_fn;

  /* The obstack to use for all pass-specific bitmaps.  */
  bitmap_obstack m_bitmap_obstack;

  /* An obstack to use for general allocation.  */
  obstack m_obstack;

  /* The number of loops in the function.  */
  unsigned int m_nloops;

  /* The total number of loop version conditions we've found.  */
  unsigned int m_num_conditions;

  /* Assume that an address fragment of the form i * stride * scale
     (for variable stride and constant scale) will not benefit from
     versioning for stride == 1 when scale is greater than this value.  */
  unsigned HOST_WIDE_INT m_maximum_scale;

  /* Information about each loop.  */
  auto_vec<loop_info> m_loops;

  /* Used to form a linked list of blocks that belong to a loop,
     started by loop_info::block_list.  */
  auto_vec<basic_block> m_next_block_in_loop;

  /* The list of loops that we've decided to version.  */
  auto_vec<class loop *> m_loops_to_version;

  /* A table of addresses in the current loop, keyed off their values
     but not their offsets.  */
  hash_table <address_info_hasher> m_address_table;

  /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
  auto_vec <address_info *, 32> m_address_list;
};
 359
 360 /* If EXPR is an SSA name and not a default definition, return the
 361    defining statement, otherwise return null.  */
 362
 363 static gimple *
 364 maybe_get_stmt (tree expr)
 365 {
 366   if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
 367     return SSA_NAME_DEF_STMT (expr);
 368   return NULL;
 369 }
 370
 371 /* Like maybe_get_stmt, but also return null if the defining
 372    statement isn't an assignment.  */
 373
 374 static gassign *
 375 maybe_get_assign (tree expr)
 376 {
 377   return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
 378 }
 379
 380 /* Return true if this pass should look through a cast of expression FROM
 381    to type TYPE when analyzing pieces of an address.  */
 382
 383 static bool
 384 look_through_cast_p (tree type, tree from)
 385 {
 386   return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
 387           && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
 388 }
 389
 390 /* Strip all conversions of integers or pointers from EXPR, regardless
 391    of whether the conversions are nops.  This is useful in the context
 392    of this pass because we're not trying to fold or simulate the
 393    expression; we just want to see how it's structured.  */
 394
 395 static tree
 396 strip_casts (tree expr)
 397 {
 398   const unsigned int MAX_NITERS = 4;
 399
 400   tree type = TREE_TYPE (expr);
 401   while (CONVERT_EXPR_P (expr)
 402          && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
 403     expr = TREE_OPERAND (expr, 0);
 404
 405   for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
 406     {
 407       gassign *assign = maybe_get_assign (expr);
 408       if (assign
 409           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
 410           && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
 411         expr = gimple_assign_rhs1 (assign);
 412       else
 413         break;
 414     }
 415   return expr;
 416 }
 417
 418 /* Compare two address_term_infos in the same address_info.  */
 419
 420 static int
 421 compare_address_terms (const void *a_uncast, const void *b_uncast)
 422 {
 423   const address_term_info *a = (const address_term_info *) a_uncast;
 424   const address_term_info *b = (const address_term_info *) b_uncast;
 425
 426   if (a->expr != b->expr)
 427     return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;
 428
 429   if (a->multiplier != b->multiplier)
 430     return a->multiplier < b->multiplier ? -1 : 1;
 431
 432   return 0;
 433 }
 434
 435 /* Dump ADDRESS using flags FLAGS.  */
 436
 437 static void
 438 dump_address_info (dump_flags_t flags, address_info &address)
 439 {
 440   if (address.base)
 441     dump_printf (flags, "%T + ", address.base);
 442   for (unsigned int i = 0; i < address.terms.length (); ++i)
 443     {
 444       if (i != 0)
 445         dump_printf (flags, " + ");
 446       dump_printf (flags, "%T", address.terms[i].expr);
 447       if (address.terms[i].multiplier != 1)
 448         dump_printf (flags, " * %wd", address.terms[i].multiplier);
 449     }
 450   dump_printf (flags, " + [%wd, %wd]",
 451                address.min_offset, address.max_offset - 1);
 452 }
 453
 454 /* Hash an address_info based on its base and terms.  */
 455
 456 hashval_t
 457 address_info_hasher::hash (const address_info *info)
 458 {
 459   inchash::hash hash;
 460   hash.add_int (info->base ? TREE_CODE (info->base) : 0);
 461   hash.add_int (info->terms.length ());
 462   for (unsigned int i = 0; i < info->terms.length (); ++i)
 463     {
 464       hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
 465       hash.add_hwi (info->terms[i].multiplier);
 466     }
 467   return hash.end ();
 468 }
 469
 470 /* Return true if two address_infos have equal bases and terms.  Other
 471    properties might be different (such as the statement or constant
 472    offset range).  */
 473
 474 bool
 475 address_info_hasher::equal (const address_info *a, const address_info *b)
 476 {
 477   if (a->base != b->base
 478       && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
 479     return false;
 480
 481   if (a->terms.length () != b->terms.length ())
 482     return false;
 483
 484   for (unsigned int i = 0; i < a->terms.length (); ++i)
 485     if (a->terms[i].expr != b->terms[i].expr
 486         || a->terms[i].multiplier != b->terms[i].multiplier)
 487       return false;
 488
 489   return true;
 490 }
 491
 492 /* Return true if we want to version the loop, i.e. if we have a
 493    specific reason for doing so and no specific reason not to.  */
 494
 495 bool
 496 loop_info::worth_versioning_p () const
 497 {
 498   return (!rejected_p
 499           && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
 500 }
 501
/* Set up a walk of the dominator tree (CDI_DOMINATORS) on behalf of
   pass instance LV.  */

loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
  : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
{
}
 506
 507 /* Process BB before processing the blocks it dominates.  */
 508
 509 edge
 510 loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
 511 {
 512   m_range_analyzer.enter (bb);
 513
 514   if (bb == bb->loop_father->header)
 515     m_lv.prune_loop_conditions (bb->loop_father,
 516                                 m_range_analyzer.get_vr_values ());
 517
 518   for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 519        gsi_next (&si))
 520     m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false);
 521
 522   return NULL;
 523 }
 524
/* Process BB after processing the blocks it dominates.  This tells the
   range analyzer that we are leaving BB, pairing with the call to
   enter in before_dom_children.  */

void
loop_versioning::lv_dom_walker::after_dom_children (basic_block bb)
{
  m_range_analyzer.leave (bb);
}
 532
 533 /* Decide whether to replace VAL with a new value in a versioned loop.
 534    Return the new value if so, otherwise return null.  */
 535
 536 tree
 537 loop_versioning::name_prop::get_value (tree val,
 538                                        gimple *stmt ATTRIBUTE_UNUSED)
 539 {
 540   if (TREE_CODE (val) == SSA_NAME
 541       && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
 542     return build_one_cst (TREE_TYPE (val));
 543   return NULL_TREE;
 544 }
 545
 546 /* Initialize the structure to optimize FN.  */
 547
 548 loop_versioning::loop_versioning (function *fn)
 549   : m_fn (fn),
 550     m_nloops (number_of_loops (fn)),
 551     m_num_conditions (0),
 552     m_address_table (31)
 553 {
 554   bitmap_obstack_initialize (&m_bitmap_obstack);
 555   gcc_obstack_init (&m_obstack);
 556
 557   /* Initialize the loop information.  */
 558   m_loops.safe_grow_cleared (m_nloops, true);
 559   for (unsigned int i = 0; i < m_nloops; ++i)
 560     {
 561       m_loops[i].outermost = get_loop (m_fn, 0);
 562       bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
 563     }
 564
 565   /* Initialize the list of blocks that belong to each loop.  */
 566   unsigned int nbbs = last_basic_block_for_fn (fn);
 567   m_next_block_in_loop.safe_grow (nbbs, true);
 568   basic_block bb;
 569   FOR_EACH_BB_FN (bb, fn)
 570     {
 571       loop_info &li = get_loop_info (bb->loop_father);
 572       m_next_block_in_loop[bb->index] = li.block_list;
 573       li.block_list = bb;
 574     }
 575
 576   /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
 577      unvectorizable code, since it is the largest size that can be
 578      handled efficiently by scalar code.  omp_max_vf calculates the
 579      maximum number of bytes in a vector, when such a value is relevant
 580      to loop optimization.  */
 581   m_maximum_scale = estimated_poly_value (omp_max_vf ());
 582   m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
 583 }
 584
/* Release the bitmap obstack and the general obstack that were set up
   by the constructor.  */

loop_versioning::~loop_versioning ()
{
  bitmap_obstack_release (&m_bitmap_obstack);
  obstack_free (&m_obstack, NULL);
}
 590
 591 /* Return the maximum number of instructions allowed in LOOP before
 592    it becomes too big for versioning.
 593
 594    There are separate limits for inner and outer loops.  The limit for
 595    inner loops applies only to loops that benefit directly from versioning.
 596    The limit for outer loops applies to all code in the outer loop and
 597    its subloops that *doesn't* benefit directly from versioning; such code
 598    would be "taken along for the ride".  The idea is that if the cost of
 599    the latter is small, it is better to version outer loops rather than
 600    inner loops, both to reduce the number of repeated checks and to enable
 601    more of the loop nest to be optimized as a natural nest (e.g. by loop
 602    interchange or outer-loop vectorization).  */
 603
 604 unsigned int
 605 loop_versioning::max_insns_for_loop (class loop *loop)
 606 {
 607   return (loop->inner
 608           ? param_loop_versioning_max_outer_insns
 609           : param_loop_versioning_max_inner_insns);
 610 }
 611
 612 /* Return true if for cost reasons we should avoid versioning any loop
 613    that contains STMT.
 614
 615    Note that we don't need to check whether versioning is invalid for
 616    correctness reasons, since the versioning process does that for us.
 617    The conditions involved are too rare to be worth duplicating here.  */
 618
 619 bool
 620 loop_versioning::expensive_stmt_p (gimple *stmt)
 621 {
 622   if (gcall *call = dyn_cast <gcall *> (stmt))
 623     /* Assume for now that the time spent in an "expensive" call would
 624        overwhelm any saving from versioning.  */
 625     return !gimple_inexpensive_call_p (call);
 626   return false;
 627 }
 628
 629 /* Record that we want to version the loop that contains STMT for the
 630    case in which SSA name NAME is equal to 1.  We already know that NAME
 631    is invariant in the loop.  */
 632
 633 void
 634 loop_versioning::version_for_unity (gimple *stmt, tree name)
 635 {
 636   class loop *loop = loop_containing_stmt (stmt);
 637   loop_info &li = get_loop_info (loop);
 638
 639   if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
 640     {
 641       /* This is the first time we've wanted to version LOOP for NAME.
 642          Keep track of the outermost loop that can handle all versioning
 643          checks in LI.  */
 644       class loop *outermost
 645         = outermost_invariant_loop_for_expr (loop, name);
 646       if (loop_depth (li.outermost) < loop_depth (outermost))
 647         li.outermost = outermost;
 648
 649       if (dump_enabled_p ())
 650         {
 651           dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
 652                            " for when %T == 1", name);
 653           if (outermost == loop)
 654             dump_printf (MSG_NOTE, "; cannot hoist check further");
 655           else
 656             {
 657               dump_printf (MSG_NOTE, "; could implement the check at loop"
 658                            " depth %d", loop_depth (outermost));
 659               if (loop_depth (li.outermost) > loop_depth (outermost))
 660                 dump_printf (MSG_NOTE, ", but other checks only allow"
 661                              " a depth of %d", loop_depth (li.outermost));
 662             }
 663           dump_printf (MSG_NOTE, "\n");
 664         }
 665
 666       m_num_conditions += 1;
 667     }
 668   else
 669     {
 670       /* This is a duplicate request.  */
 671       if (dump_enabled_p ())
 672         dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
 673                          " loop for when %T == 1\n", name);
 674     }
 675 }
 676
 677 /* Return true if OP1_TREE is constant and if in principle it is worth
 678    versioning an address fragment of the form:
 679
 680      i * OP1_TREE * OP2 * stride
 681
 682    for the case in which stride == 1.  This in practice means testing
 683    whether:
 684
 685      OP1_TREE * OP2 <= M_MAXIMUM_SCALE.
 686
 687    If RESULT is nonnull, store OP1_TREE * OP2 there when returning true.  */
 688
 689 bool
 690 loop_versioning::acceptable_multiplier_p (tree op1_tree,
 691                                           unsigned HOST_WIDE_INT op2,
 692                                           unsigned HOST_WIDE_INT *result)
 693 {
 694   if (tree_fits_uhwi_p (op1_tree))
 695     {
 696       unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
 697       /* The first part checks for overflow.  */
 698       if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
 699         {
 700           if (result)
 701             *result = op1 * op2;
 702           return true;
 703         }
 704     }
 705   return false;
 706 }
 707
 708 /* Return true if it is worth using loop versioning on a memory access
 709    of type TYPE.  Store the size of the access in *SIZE if so.  */
 710
 711 bool
 712 loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
 713 {
 714   return (TYPE_SIZE_UNIT (type)
 715           && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
 716 }
 717
 718 /* See whether OP is constant and whether we can multiply TERM by that
 719    constant without exceeding M_MAXIMUM_SCALE.  Return true and update
 720    TERM if so.  */
 721
 722 bool
 723 loop_versioning::multiply_term_by (address_term_info &term, tree op)
 724 {
 725   return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
 726 }
 727
 728 /* Decide whether an address fragment of the form STRIDE * MULTIPLIER
 729    is likely to be indexing an innermost dimension, returning the result
 730    as an INNER_* probability.  */
 731
 732 inner_likelihood
 733 loop_versioning::get_inner_likelihood (tree stride,
 734                                        unsigned HOST_WIDE_INT multiplier)
 735 {
 736   const unsigned int MAX_NITERS = 8;
 737
 738   /* Iterate over possible values of STRIDE.  Return INNER_LIKELY if at
 739      least one of those values is likely to be for the innermost dimension.
 740      Record in UNLIKELY_P if at least one of those values is unlikely to be
 741      for the innermost dimension.
 742
 743      E.g. for:
 744
 745        stride = cond ? a * b : 1
 746
 747      we should treat STRIDE as being a likely inner dimension, since
 748      we know that it is 1 under at least some circumstances.  (See the
 749      Fortran example below.)  However:
 750
 751        stride = a * b
 752
 753      on its own is unlikely to be for the innermost dimension, since
 754      that would require both a and b to be 1 at runtime.  */
 755   bool unlikely_p = false;
 756   tree worklist[MAX_NITERS];
 757   unsigned int length = 0;
 758   worklist[length++] = stride;
 759   for (unsigned int i = 0; i < length; ++i)
 760     {
 761       tree expr = worklist[i];
 762
 763       if (CONSTANT_CLASS_P (expr))
 764         {
 765           /* See if EXPR * MULTIPLIER would be consistent with an individual
 766              access or a small grouped access.  */
 767           if (acceptable_multiplier_p (expr, multiplier))
 768             return INNER_LIKELY;
 769           else
 770             unlikely_p = true;
 771         }
 772       else if (gimple *stmt = maybe_get_stmt (expr))
 773         {
 774           /* If EXPR is set by a PHI node, queue its arguments in case
 775              we find one that is consistent with an inner dimension.
 776
 777              An important instance of this is the Fortran handling of array
 778              descriptors, which calculates the stride of the inner dimension
 779              using a PHI equivalent of:
 780
 781                 raw_stride = a.dim[0].stride;
 782                 stride = raw_stride != 0 ? raw_stride : 1;
 783
 784              (Strides for outer dimensions do not treat 0 specially.)  */
 785           if (gphi *phi = dyn_cast <gphi *> (stmt))
 786             {
 787               unsigned int nargs = gimple_phi_num_args (phi);
 788               for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
 789                 worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
 790             }
 791           /* If the value is set by an assignment, expect it to be read
 792              from memory (such as an array descriptor) rather than be
 793              calculated.  */
 794           else if (gassign *assign = dyn_cast <gassign *> (stmt))
 795             {
 796               if (!gimple_assign_load_p (assign))
 797                 unlikely_p = true;
 798             }
 799           /* Things like calls don't really tell us anything.  */
 800         }
 801     }
 802
 803   /* We didn't find any possible values of STRIDE that were likely to be
 804      for the innermost dimension.  If we found one that was actively
 805      unlikely to be for the innermost dimension, assume that that applies
 806      to STRIDE too.  */
 807   return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
 808 }
 809
 810 /* Dump the likelihood that TERM's stride is for the innermost dimension.
 811    ADDRESS is the address that contains TERM.  */
 812
 813 void
 814 loop_versioning::dump_inner_likelihood (address_info &address,
 815                                         address_term_info &term)
 816 {
 817   if (term.inner_likelihood == INNER_LIKELY)
 818     dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
 819                      " innermost dimension\n", term.stride);
 820   else if (term.inner_likelihood == INNER_UNLIKELY)
 821     dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
 822                      " innermost dimension\n", term.stride);
 823   else
 824     dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
 825                      " is the innermost dimension\n", term.stride);
 826 }
 827
 828 /* The caller has identified that STRIDE is the stride of interest
 829    in TERM, and that the stride is applied in OP_LOOP.  Record this
 830    information in TERM, deciding whether STRIDE is likely to be for
 831    the innermost dimension of an array and whether it represents a
 832    versioning opportunity.  ADDRESS is the address that contains TERM.  */
 833
 834 void
 835 loop_versioning::analyze_stride (address_info &address,
 836                                  address_term_info &term,
 837                                  tree stride, class loop *op_loop)
 838 {
 839   term.stride = stride;
 840
 841   term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
 842   if (dump_enabled_p ())
 843     dump_inner_likelihood (address, term);
 844
 845   /* To be a versioning opportunity we require:
 846
 847      - The multiplier applied by TERM is equal to the access size,
 848        so that when STRIDE is 1, the accesses in successive loop
 849        iterations are consecutive.
 850
 851        This is deliberately conservative.  We could relax it to handle
 852        other cases (such as those with gaps between iterations) if we
 853        find any real testcases for which it's useful.
 854
 855      - the stride is applied in the same loop as STMT rather than
 856        in an outer loop.  Although versioning for strides applied in
 857        outer loops could help in some cases -- such as enabling
 858        more loop interchange -- the savings are much lower than for
 859        inner loops.
 860
 861      - the stride is an SSA name that is invariant in STMT's loop,
 862        since otherwise versioning isn't possible.  */
 863   unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
 864   if (term.multiplier == access_size
 865       && address.loop == op_loop
 866       && TREE_CODE (stride) == SSA_NAME
 867       && expr_invariant_in_loop_p (address.loop, stride))
 868     {
 869       term.versioning_opportunity_p = true;
 870       if (dump_enabled_p ())
 871         dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
 872                          " opportunity\n", stride);
 873     }
 874 }
 875
 876 /* See whether address term TERM (which belongs to ADDRESS) is the result
 877    of multiplying a varying SSA name by a loop-invariant SSA name.
 878    Return true and update TERM if so.
 879
 880    This handles both cases that SCEV might handle, such as:
 881
 882      for (int i = 0; i < n; ++i)
 883        res += a[i * stride];
 884
 885    and ones in which the term varies arbitrarily between iterations, such as:
 886
 887      for (int i = 0; i < n; ++i)
 888        res += a[index[i] * stride];  */
 889
 890 bool
 891 loop_versioning::find_per_loop_multiplication (address_info &address,
 892                                                address_term_info &term)
 893 {
 894   gassign *mult = maybe_get_assign (term.expr);
 895   if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
 896     return false;
 897
 898   class loop *mult_loop = loop_containing_stmt (mult);
 899   if (!loop_outer (mult_loop))
 900     return false;
 901
 902   tree op1 = strip_casts (gimple_assign_rhs1 (mult));
 903   tree op2 = strip_casts (gimple_assign_rhs2 (mult));
 904   if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
 905     return false;
 906
 907   bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
 908   bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
 909   if (invariant1_p == invariant2_p)
 910     return false;
 911
 912   /* Make sure that the loop invariant is OP2 rather than OP1.  */
 913   if (invariant1_p)
 914     std::swap (op1, op2);
 915
 916   if (dump_enabled_p ())
 917     dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
 918                      " * loop-invariant %T\n", term.expr, op1, op2);
 919   analyze_stride (address, term, op2, mult_loop);
 920   return true;
 921 }
 922
 923 /* Try to use scalar evolutions to find an address stride for TERM,
 924    which belongs to ADDRESS.  Return true and update TERM if so.
 925
 926    Here we are interested in any evolution information we can find,
 927    not just evolutions wrt ADDRESS->LOOP.  For example, if we find that
 928    an outer loop obviously iterates over the inner dimension of an array,
 929    that information can help us eliminate worthless versioning opportunities
 930    in inner loops.  */
 931
 932 bool
 933 loop_versioning::analyze_term_using_scevs (address_info &address,
 934                                            address_term_info &term)
 935 {
 936   gimple *setter = maybe_get_stmt (term.expr);
 937   if (!setter)
 938     return false;
 939
 940   class loop *wrt_loop = loop_containing_stmt (setter);
 941   if (!loop_outer (wrt_loop))
 942     return false;
 943
 944   tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
 945   if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
 946     {
 947       if (dump_enabled_p ())
 948         dump_printf_loc (MSG_NOTE, address.stmt,
 949                          "address term %T = %T\n", term.expr, chrec);
 950
 951       /* Peel casts and accumulate constant multiplications, up to the
 952          limit allowed by M_MAXIMUM_SCALE.  */
 953       tree stride = strip_casts (CHREC_RIGHT (chrec));
 954       while (TREE_CODE (stride) == MULT_EXPR
 955              && multiply_term_by (term, TREE_OPERAND (stride, 1)))
 956         stride = strip_casts (TREE_OPERAND (stride, 0));
 957
 958       gassign *assign;
 959       while ((assign = maybe_get_assign (stride))
 960              && gimple_assign_rhs_code (assign) == MULT_EXPR
 961              && multiply_term_by (term, gimple_assign_rhs2 (assign)))
 962         {
 963           if (dump_enabled_p ())
 964             dump_printf_loc (MSG_NOTE, address.stmt,
 965                              "looking through %G", assign);
 966           stride = strip_casts (gimple_assign_rhs1 (assign));
 967         }
 968
 969       analyze_stride (address, term, stride, get_chrec_loop (chrec));
 970       return true;
 971     }
 972
 973   return false;
 974 }
 975
 976 /* Address term TERM is an arbitrary term that provides no versioning
 977    opportunities.  Analyze it to see whether it contains any likely
 978    inner strides, so that we don't mistakenly version for other
 979    (less likely) candidates.
 980
 981    This copes with invariant innermost indices such as:
 982
 983      x(i, :) = 100
 984
 985    where the "i" component of the address is invariant in the loop
 986    but provides the real inner stride.
 987
 988    ADDRESS is the address that contains TERM.  */
 989
 990 void
 991 loop_versioning::analyze_arbitrary_term (address_info &address,
 992                                          address_term_info &term)
 993
 994 {
 995   /* A multiplication offers two potential strides.  Pick the one that
 996      is most likely to be an innermost stride.  */
 997   tree expr = term.expr, alt = NULL_TREE;
 998   gassign *mult = maybe_get_assign (expr);
 999   if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR)
1000     {
1001       expr = strip_casts (gimple_assign_rhs1 (mult));
1002       alt = strip_casts (gimple_assign_rhs2 (mult));
1003     }
1004   term.stride = expr;
1005   term.inner_likelihood = get_inner_likelihood (expr, term.multiplier);
1006   if (alt)
1007     {
1008       inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier);
1009       if (alt_l > term.inner_likelihood)
1010         {
1011           term.stride = alt;
1012           term.inner_likelihood = alt_l;
1013         }
1014     }
1015   if (dump_enabled_p ())
1016     dump_inner_likelihood (address, term);
1017 }
1018
1019 /* Try to identify loop strides in ADDRESS and try to choose realistic
1020    versioning opportunities based on these strides.
1021
1022    The main difficulty here isn't finding strides that could be used
1023    in a version check (that's pretty easy).  The problem instead is to
1024    avoid versioning for some stride S that is unlikely ever to be 1 at
1025    runtime.  Versioning for S == 1 on its own would lead to unnecessary
1026    code bloat, while adding S == 1 to more realistic version conditions
1027    would lose the optimisation opportunity offered by those other conditions.
1028
1029    For example, versioning for a stride of 1 in the Fortran code:
1030
1031      integer :: a(:,:)
1032      a(1,:) = 1
1033
1034    is not usually a good idea, since the assignment is iterating over
1035    an outer dimension and is relatively unlikely to have a stride of 1.
1036    (It isn't impossible, since the inner dimension might be 1, or the
1037    array might be transposed.)  Similarly, in:
1038
1039      integer :: a(:,:), b(:,:)
1040      b(:,1) = a(1,:)
1041
1042    b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
1043    Versioning for when both strides are 1 would lose most of the benefit
1044    of versioning for b's access.
1045
1046    The approach we take is as follows:
1047
1048    - Analyze each term to see whether it has an identifiable stride,
1049      regardless of which loop applies the stride.
1050
1051    - Evaluate the likelihood that each such stride is for the innermost
1052      dimension of an array, on the scale "likely", "don't know" or "unlikely".
1053
1054    - If there is a single "likely" innermost stride, and that stride is
1055      applied in the loop that contains STMT, version the loop for when the
1056      stride is 1.  This deals with the cases in which we're fairly
1057      confident of doing the right thing, such as the b(:,1) reference above.
1058
1059    - If there are no "likely" innermost strides, and the loop that contains
1060      STMT uses a stride that we rated as "don't know", version for when
1061      that stride is 1.  This is principally used for C code such as:
1062
1063        for (int i = 0; i < n; ++i)
1064          a[i * x] = ...;
1065
1066      and:
1067
1068        for (int j = 0; j < n; ++j)
1069          for (int i = 0; i < n; ++i)
1070            a[i * x + j * y] = ...;
1071
1072      where nothing in the way "x" and "y" are set gives a hint as to
1073      whether "i" iterates over the innermost dimension of the array.
1074      In these situations it seems reasonable to assume the
1075      programmer has nested the loops appropriately (although of course
1076      there are examples like GEMM in which this assumption doesn't hold
1077      for all accesses in the loop).
1078
1079      This case is also useful for the Fortran equivalent of the
1080      above C code.  */
1081
1082 void
1083 loop_versioning::analyze_address_fragment (address_info &address)
1084 {
1085   if (dump_enabled_p ())
1086     {
1087       dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
1088       dump_address_info (MSG_NOTE, address);
1089       dump_printf (MSG_NOTE, "\n");
1090     }
1091
1092   /* Analyze each component of the sum to see whether it involves an
1093      apparent stride.
1094
1095      There is an overlap between the addresses that
1096      find_per_loop_multiplication and analyze_term_using_scevs can handle,
1097      but the former is much cheaper than SCEV analysis, so try it first.  */
1098   for (unsigned int i = 0; i < address.terms.length (); ++i)
1099     if (!find_per_loop_multiplication (address, address.terms[i])
1100         && !analyze_term_using_scevs (address, address.terms[i])
1101         && !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr)))
1102       analyze_arbitrary_term (address, address.terms[i]);
1103
1104   /* Check for strides that are likely to be for the innermost dimension.
1105
1106      1. If there is a single likely inner stride, if it is an SSA name,
1107         and if it is worth versioning the loop for when the SSA name
1108         equals 1, record that we want to do so.
1109
1110      2. Otherwise, if there any likely inner strides, bail out.  This means
1111         one of:
1112
1113         (a) There are multiple likely inner strides.  This suggests we're
1114             confused and be can't be confident of doing the right thing.
1115
1116         (b) There is a single likely inner stride and it is a constant
1117             rather than an SSA name.  This can mean either that the access
1118             is a natural one without any variable strides, such as:
1119
1120               for (int i = 0; i < n; ++i)
1121                 a[i] += 1;
1122
1123             or that a variable stride is applied to an outer dimension,
1124             such as:
1125
1126               for (int i = 0; i < n; ++i)
1127                 for (int j = 0; j < n; ++j)
1128                   a[j * stride][i] += 1;
1129
1130         (c) There is a single likely inner stride, and it is an SSA name,
1131             but it isn't a worthwhile versioning opportunity.  This usually
1132             means that the variable stride is applied by an outer loop,
1133             such as:
1134
1135               for (int i = 0; i < n; ++i)
1136                 for (int j = 0; j < n; ++j)
1137                   a[j][i * stride] += 1;
1138
1139             or (using an example with a more natural loop nesting):
1140
1141               for (int i = 0; i < n; ++i)
1142                 for (int j = 0; j < n; ++j)
1143                   a[i][j] += b[i * stride];
1144
1145             in cases where b[i * stride] cannot (yet) be hoisted for
1146             aliasing reasons.
1147
1148      3. If there are no likely inner strides, fall through to the next
1149         set of checks.
1150
1151      Pointer equality is enough to check for uniqueness in (1), since we
1152      only care about SSA names.  */
1153   tree chosen_stride = NULL_TREE;
1154   tree version_stride = NULL_TREE;
1155   for (unsigned int i = 0; i < address.terms.length (); ++i)
1156     if (chosen_stride != address.terms[i].stride
1157         && address.terms[i].inner_likelihood == INNER_LIKELY)
1158       {
1159         if (chosen_stride)
1160           return;
1161         chosen_stride = address.terms[i].stride;
1162         if (address.terms[i].versioning_opportunity_p)
1163           version_stride = chosen_stride;
1164       }
1165
1166   /* If there are no likely inner strides, see if there is a single
1167      versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
1168      See the comment above the function for the cases that this code
1169      handles.  */
1170   if (!chosen_stride)
1171     for (unsigned int i = 0; i < address.terms.length (); ++i)
1172       if (version_stride != address.terms[i].stride
1173           && address.terms[i].inner_likelihood == INNER_DONT_KNOW
1174           && address.terms[i].versioning_opportunity_p)
1175         {
1176           if (version_stride)
1177             return;
1178           version_stride = address.terms[i].stride;
1179         }
1180
1181   if (version_stride)
1182     version_for_unity (address.stmt, version_stride);
1183 }
1184
1185 /* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
1186    TYPE_SIZE bytes and record this address fragment for later processing.
1187    STMT is the statement that contains the address.  */
1188
1189 void
1190 loop_versioning::record_address_fragment (gimple *stmt,
1191                                           unsigned HOST_WIDE_INT type_size,
1192                                           tree expr,
1193                                           unsigned HOST_WIDE_INT multiplier,
1194                                           HOST_WIDE_INT offset)
1195 {
1196   /* We're only interested in computed values.  */
1197   if (TREE_CODE (expr) != SSA_NAME)
1198     return;
1199
1200   /* Quick exit if no part of the address is calculated in STMT's loop,
1201      since such addresses have no versioning opportunities.  */
1202   class loop *loop = loop_containing_stmt (stmt);
1203   if (expr_invariant_in_loop_p (loop, expr))
1204     return;
1205
1206   /* Set up an address_info for EXPR * MULTIPLIER.  */
1207   address_info *address = XOBNEW (&m_obstack, address_info);
1208   new (address) address_info;
1209   address->stmt = stmt;
1210   address->loop = loop;
1211   address->base = NULL_TREE;
1212   address->terms.quick_grow (1);
1213   address->terms[0].expr = expr;
1214   address->terms[0].multiplier = multiplier;
1215   address->terms[0].stride = NULL_TREE;
1216   address->terms[0].inner_likelihood = INNER_UNLIKELY;
1217   address->terms[0].versioning_opportunity_p = false;
1218   address->min_offset = offset;
1219
1220   /* Peel apart the expression into a sum of address_terms, where each
1221      term is multiplied by a constant.  Treat a + b and a - b the same,
1222      since it doesn't matter for our purposes whether an address is
1223      increasing or decreasing.  Distribute (a + b) * constant into
1224      a * constant + b * constant.
1225
1226      We don't care which loop each term belongs to, since we want to
1227      examine as many candidate strides as possible when determining
1228      which is likely to be for the innermost dimension.  We therefore
1229      don't limit the search to statements in STMT's loop.  */
1230   for (unsigned int i = 0; i < address->terms.length (); )
1231     {
1232       if (gassign *assign = maybe_get_assign (address->terms[i].expr))
1233         {
1234           tree_code code = gimple_assign_rhs_code (assign);
1235           if (code == PLUS_EXPR
1236               || code == POINTER_PLUS_EXPR
1237               || code == MINUS_EXPR)
1238             {
1239               tree op1 = gimple_assign_rhs1 (assign);
1240               tree op2 = gimple_assign_rhs2 (assign);
1241               if (TREE_CODE (op2) == INTEGER_CST)
1242                 {
1243                   address->terms[i].expr = strip_casts (op1);
1244                   /* This is heuristic only, so don't worry about truncation
1245                      or overflow.  */
1246                   address->min_offset += (TREE_INT_CST_LOW (op2)
1247                                           * address->terms[i].multiplier);
1248                   continue;
1249                 }
1250               else if (address->terms.length () < address_info::MAX_TERMS)
1251                 {
1252                   unsigned int j = address->terms.length ();
1253                   address->terms.quick_push (address->terms[i]);
1254                   address->terms[i].expr = strip_casts (op1);
1255                   address->terms[j].expr = strip_casts (op2);
1256                   continue;
1257                 }
1258             }
1259           if (code == MULT_EXPR)
1260             {
1261               tree op1 = gimple_assign_rhs1 (assign);
1262               tree op2 = gimple_assign_rhs2 (assign);
1263               if (multiply_term_by (address->terms[i], op2))
1264                 {
1265                   address->terms[i].expr = strip_casts (op1);
1266                   continue;
1267                 }
1268             }
1269           if (CONVERT_EXPR_CODE_P (code))
1270             {
1271               tree op1 = gimple_assign_rhs1 (assign);
1272               address->terms[i].expr = strip_casts (op1);
1273               continue;
1274             }
1275         }
1276       i += 1;
1277     }
1278
1279   /* Peel off any symbolic pointer.  */
1280   if (TREE_CODE (address->terms[0].expr) != SSA_NAME
1281       && address->terms[0].multiplier == 1)
1282     {
1283       if (address->terms.length () == 1)
1284         {
1285           obstack_free (&m_obstack, address);
1286           return;
1287         }
1288       address->base = address->terms[0].expr;
1289       address->terms.ordered_remove (0);
1290     }
1291
1292   /* Require all remaining terms to be SSA names.  (This could be false
1293      for unfolded statements, but they aren't worth dealing with.)  */
1294   for (unsigned int i = 0; i < address->terms.length (); ++i)
1295     if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
1296       {
1297         obstack_free (&m_obstack, address);
1298         return;
1299       }
1300
1301   /* The loop above set MIN_OFFSET based on the first byte of the
1302      referenced data.  Calculate the end + 1.  */
1303   address->max_offset = address->min_offset + type_size;
1304
1305   /* Put the terms into a canonical order for the hash table lookup below.  */
1306   address->terms.qsort (compare_address_terms);
1307
1308   if (dump_enabled_p ())
1309     {
1310       dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
1311       if (multiplier != 1)
1312         dump_printf (MSG_NOTE, " * %wd", multiplier);
1313       dump_printf (MSG_NOTE, " = ");
1314       dump_address_info (MSG_NOTE, *address);
1315       dump_printf (MSG_NOTE, "\n");
1316     }
1317
1318   /* Pool address information with the same terms (but potentially
1319      different offsets).  */
1320   address_info **slot = m_address_table.find_slot (address, INSERT);
1321   if (address_info *old_address = *slot)
1322     {
1323       /* We've already seen an address with the same terms.  Extend the
1324          offset range to account for this access.  Doing this can paper
1325          over gaps, such as in:
1326
1327            a[i * stride * 4] + a[i * stride * 4 + 3];
1328
1329          where nothing references "+ 1" or "+ 2".  However, the vectorizer
1330          handles such gapped accesses without problems, so it's not worth
1331          trying to exclude them.  */
1332       if (old_address->min_offset > address->min_offset)
1333         old_address->min_offset = address->min_offset;
1334       if (old_address->max_offset < address->max_offset)
1335         old_address->max_offset = address->max_offset;
1336       obstack_free (&m_obstack, address);
1337     }
1338   else
1339     {
1340       /* This is the first time we've seen an address with these terms.  */
1341       *slot = address;
1342       m_address_list.safe_push (address);
1343     }
1344 }
1345
1346 /* Analyze expression EXPR, which occurs in STMT.  */
1347
1348 void
1349 loop_versioning::analyze_expr (gimple *stmt, tree expr)
1350 {
1351   unsigned HOST_WIDE_INT type_size;
1352
1353   while (handled_component_p (expr))
1354     {
1355       /* See whether we can use versioning to avoid a multiplication
1356          in an array index.  */
1357       if (TREE_CODE (expr) == ARRAY_REF
1358           && acceptable_type_p (TREE_TYPE (expr), &type_size))
1359         record_address_fragment (stmt, type_size,
1360                                  TREE_OPERAND (expr, 1), type_size, 0);
1361       expr = TREE_OPERAND (expr, 0);
1362     }
1363
1364   /* See whether we can use versioning to avoid a multiplication
1365      in the pointer calculation of a MEM_REF.  */
1366   if (TREE_CODE (expr) == MEM_REF
1367       && acceptable_type_p (TREE_TYPE (expr), &type_size))
1368     record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
1369                              /* This is heuristic only, so don't worry
1370                                 about truncation or overflow.  */
1371                              TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));
1372
1373   /* These would be easy to handle if they existed at this stage.  */
1374   gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
1375 }
1376
1377 /* Analyze all the statements in BB looking for useful version checks.
1378    Return true on success, false if something prevents the block from
1379    being versioned.  */
1380
1381 bool
1382 loop_versioning::analyze_block (basic_block bb)
1383 {
1384   class loop *loop = bb->loop_father;
1385   loop_info &li = get_loop_info (loop);
1386   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1387        gsi_next (&gsi))
1388     {
1389       gimple *stmt = gsi_stmt (gsi);
1390       if (is_gimple_debug (stmt))
1391         continue;
1392
1393       if (expensive_stmt_p (stmt))
1394         {
1395           if (dump_enabled_p ())
1396             dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
1397                              " prevents versioning: %G", stmt);
1398           return false;
1399         }
1400
1401       /* Only look for direct versioning opportunities in inner loops
1402          since the benefit tends to be much smaller for outer loops.  */
1403       if (!loop->inner)
1404         {
1405           unsigned int nops = gimple_num_ops (stmt);
1406           for (unsigned int i = 0; i < nops; ++i)
1407             if (tree op = gimple_op (stmt, i))
1408               analyze_expr (stmt, op);
1409         }
1410
1411       /* The point of the instruction limit is to prevent excessive
1412          code growth, so this is a size-based estimate even though
1413          the optimization is aimed at speed.  */
1414       li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
1415     }
1416
1417   return true;
1418 }
1419
1420 /* Analyze all the blocks in the function, looking for useful version checks.
1421    Return true if we found one.  */
1422
1423 bool
1424 loop_versioning::analyze_blocks ()
1425 {
1426   AUTO_DUMP_SCOPE ("analyze_blocks",
1427                    dump_user_location_t::from_function_decl (m_fn->decl));
1428
1429   /* For now we don't try to version the whole function, although
1430      versioning at that level could be useful in some cases.  */
1431   get_loop_info (get_loop (m_fn, 0)).rejected_p = true;
1432
1433   class loop *loop;
1434   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1435     {
1436       loop_info &linfo = get_loop_info (loop);
1437
1438       /* Ignore cold loops.  */
1439       if (!optimize_loop_for_speed_p (loop))
1440         linfo.rejected_p = true;
1441
1442       /* See whether an inner loop prevents versioning of this loop.  */
1443       if (!linfo.rejected_p)
1444         for (class loop *inner = loop->inner; inner; inner = inner->next)
1445           if (get_loop_info (inner).rejected_p)
1446             {
1447               linfo.rejected_p = true;
1448               break;
1449             }
1450
1451       /* If versioning the loop is still a possibility, examine the
1452          statements in the loop to look for versioning opportunities.  */
1453       if (!linfo.rejected_p)
1454         {
1455           void *start_point = obstack_alloc (&m_obstack, 0);
1456
1457           for (basic_block bb = linfo.block_list; bb;
1458                bb = m_next_block_in_loop[bb->index])
1459             if (!analyze_block (bb))
1460               {
1461                 linfo.rejected_p = true;
1462                 break;
1463             }
1464
1465           if (!linfo.rejected_p)
1466             {
1467               /* Process any queued address fragments, now that we have
1468                  complete grouping information.  */
1469               address_info *address;
1470               unsigned int i;
1471               FOR_EACH_VEC_ELT (m_address_list, i, address)
1472                 analyze_address_fragment (*address);
1473             }
1474
1475           m_address_table.empty ();
1476           m_address_list.truncate (0);
1477           obstack_free (&m_obstack, start_point);
1478         }
1479     }
1480
1481   return m_num_conditions != 0;
1482 }
1483
1484 /* Use the ranges in VRS to remove impossible versioning conditions from
1485    LOOP.  */
1486
1487 void
1488 loop_versioning::prune_loop_conditions (class loop *loop, vr_values *vrs)
1489 {
1490   loop_info &li = get_loop_info (loop);
1491
1492   int to_remove = -1;
1493   bitmap_iterator bi;
1494   unsigned int i;
1495   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1496     {
1497       tree name = ssa_name (i);
1498       const value_range_equiv *vr = vrs->get_value_range (name);
1499       if (vr && !vr->may_contain_p (build_one_cst (TREE_TYPE (name))))
1500         {
1501           if (dump_enabled_p ())
1502             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1503                              "%T can never be 1 in this loop\n", name);
1504
1505           if (to_remove >= 0)
1506             bitmap_clear_bit (&li.unity_names, to_remove);
1507           to_remove = i;
1508           m_num_conditions -= 1;
1509         }
1510     }
1511   if (to_remove >= 0)
1512     bitmap_clear_bit (&li.unity_names, to_remove);
1513 }
1514
1515 /* Remove any scheduled loop version conditions that will never be true.
1516    Return true if any remain.  */
1517
1518 bool
1519 loop_versioning::prune_conditions ()
1520 {
1521   AUTO_DUMP_SCOPE ("prune_loop_conditions",
1522                    dump_user_location_t::from_function_decl (m_fn->decl));
1523
1524   calculate_dominance_info (CDI_DOMINATORS);
1525   lv_dom_walker dom_walker (*this);
1526   dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
1527   return m_num_conditions != 0;
1528 }
1529
1530 /* Merge the version checks for INNER into immediately-enclosing loop
1531    OUTER.  */
1532
1533 void
1534 loop_versioning::merge_loop_info (class loop *outer, class loop *inner)
1535 {
1536   loop_info &inner_li = get_loop_info (inner);
1537   loop_info &outer_li = get_loop_info (outer);
1538
1539   if (dump_enabled_p ())
1540     {
1541       bitmap_iterator bi;
1542       unsigned int i;
1543       EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
1544         if (!bitmap_bit_p (&outer_li.unity_names, i))
1545           dump_printf_loc (MSG_NOTE, find_loop_location (inner),
1546                            "hoisting check that %T == 1 to outer loop\n",
1547                            ssa_name (i));
1548     }
1549
1550   bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
1551   if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
1552     outer_li.outermost = inner_li.outermost;
1553 }
1554
1555 /* Add LOOP to the queue of loops to version.  */
1556
1557 void
1558 loop_versioning::add_loop_to_queue (class loop *loop)
1559 {
1560   loop_info &li = get_loop_info (loop);
1561
1562   if (dump_enabled_p ())
1563     dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1564                      "queuing this loop for versioning\n");
1565   m_loops_to_version.safe_push (loop);
1566
1567   /* Don't try to version superloops.  */
1568   li.rejected_p = true;
1569 }
1570
1571 /* Decide whether the cost model would allow us to version LOOP,
1572    either directly or as part of a parent loop, and return true if so.
1573    This does not imply that the loop is actually worth versioning in its
1574    own right, just that it would be valid to version it if something
1575    benefited.
1576
1577    We have already made this decision for all inner loops of LOOP.  */
1578
1579 bool
1580 loop_versioning::decide_whether_loop_is_versionable (class loop *loop)
1581 {
1582   loop_info &li = get_loop_info (loop);
1583
1584   if (li.rejected_p)
1585     return false;
1586
1587   /* Examine the decisions made for inner loops.  */
1588   for (class loop *inner = loop->inner; inner; inner = inner->next)
1589     {
1590       loop_info &inner_li = get_loop_info (inner);
1591       if (inner_li.rejected_p)
1592         {
1593           if (dump_enabled_p ())
1594             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1595                              "not versioning this loop because one of its"
1596                              " inner loops should not be versioned\n");
1597           return false;
1598         }
1599
1600       if (inner_li.worth_versioning_p ())
1601         li.subloops_benefit_p = true;
1602
1603       /* Accumulate the number of instructions from subloops that are not
1604          the innermost, or that don't benefit from versioning.  Only the
1605          instructions from innermost loops that benefit from versioning
1606          should be weighed against loop-versioning-max-inner-insns;
1607          everything else should be weighed against
1608          loop-versioning-max-outer-insns.  */
1609       if (!inner_li.worth_versioning_p () || inner->inner)
1610         {
1611           if (dump_enabled_p ())
1612             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1613                              "counting %d instructions from this loop"
1614                              " against its parent loop\n", inner_li.num_insns);
1615           li.num_insns += inner_li.num_insns;
1616         }
1617     }
1618
1619   /* Enforce the size limits.  */
1620   if (li.worth_versioning_p ())
1621     {
1622       unsigned int max_num_insns = max_insns_for_loop (loop);
1623       if (dump_enabled_p ())
1624         dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1625                          "this loop has %d instructions, against"
1626                          " a versioning limit of %d\n",
1627                          li.num_insns, max_num_insns);
1628       if (li.num_insns > max_num_insns)
1629         {
1630           if (dump_enabled_p ())
1631             dump_printf_loc (MSG_MISSED_OPTIMIZATION
1632                              | MSG_PRIORITY_USER_FACING,
1633                              find_loop_location (loop),
1634                              "this loop is too big to version");
1635           return false;
1636         }
1637     }
1638
1639   /* Hoist all version checks from subloops to this loop.  */
1640   for (class loop *subloop = loop->inner; subloop; subloop = subloop->next)
1641     merge_loop_info (loop, subloop);
1642
1643   return true;
1644 }
1645
1646 /* Decide which loops to version and add them to the versioning queue.
1647    Return true if there are any loops to version.  */
1648
1649 bool
1650 loop_versioning::make_versioning_decisions ()
1651 {
1652   AUTO_DUMP_SCOPE ("make_versioning_decisions",
1653                    dump_user_location_t::from_function_decl (m_fn->decl));
1654
1655   class loop *loop;
1656   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1657     {
1658       loop_info &linfo = get_loop_info (loop);
1659       if (decide_whether_loop_is_versionable (loop))
1660         {
1661           /* Commit to versioning LOOP directly if we can't hoist the
1662              version checks any further.  */
1663           if (linfo.worth_versioning_p ()
1664               && (loop_depth (loop) == 1 || linfo.outermost == loop))
1665             add_loop_to_queue (loop);
1666         }
1667       else
1668         {
1669           /* We can't version this loop, so individually version any
1670              subloops that would benefit and haven't been versioned yet.  */
1671           linfo.rejected_p = true;
1672           for (class loop *subloop = loop->inner; subloop;
1673                subloop = subloop->next)
1674             if (get_loop_info (subloop).worth_versioning_p ())
1675               add_loop_to_queue (subloop);
1676         }
1677     }
1678
1679   return !m_loops_to_version.is_empty ();
1680 }
1681
1682 /* Attempt to implement loop versioning for LOOP, using the information
1683    cached in the associated loop_info.  Return true on success.  */
1684
1685 bool
1686 loop_versioning::version_loop (class loop *loop)
1687 {
1688   loop_info &li = get_loop_info (loop);
1689
1690   /* Build up a condition that selects the original loop instead of
1691      the simplified loop.  */
1692   tree cond = boolean_false_node;
1693   bitmap_iterator bi;
1694   unsigned int i;
1695   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1696     {
1697       tree name = ssa_name (i);
1698       tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
1699                                  build_one_cst (TREE_TYPE (name)));
1700       cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
1701     }
1702
1703   /* Convert the condition into a suitable gcond.  */
1704   gimple_seq stmts = NULL;
1705   cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr, NULL_TREE);
1706
1707   /* Version the loop.  */
1708   initialize_original_copy_tables ();
1709   basic_block cond_bb;
1710   li.optimized_loop = loop_version (loop, cond, &cond_bb,
1711                                     profile_probability::unlikely (),
1712                                     profile_probability::likely (),
1713                                     profile_probability::unlikely (),
1714                                     profile_probability::likely (), true);
1715   free_original_copy_tables ();
1716   if (!li.optimized_loop)
1717     {
1718       if (dump_enabled_p ())
1719         dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
1720                          "tried but failed to version this loop for when"
1721                          " certain strides are 1\n");
1722       return false;
1723     }
1724
1725   if (dump_enabled_p ())
1726     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
1727                      "versioned this loop for when certain strides are 1\n");
1728
1729   /* Insert the statements that feed COND.  */
1730   if (stmts)
1731     {
1732       gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
1733       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1734     }
1735
1736   return true;
1737 }
1738
1739 /* Attempt to version all loops in the versioning queue.  */
1740
1741 void
1742 loop_versioning::implement_versioning_decisions ()
1743 {
1744   /* No AUTO_DUMP_SCOPE here since all messages are top-level and
1745      user-facing at this point.  */
1746
1747   bool any_succeeded_p = false;
1748   class loop *loop;
1749   unsigned int i;
1750   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1751     if (version_loop (loop))
1752       any_succeeded_p = true;
1753   if (!any_succeeded_p)
1754     return;
1755
1756   update_ssa (TODO_update_ssa);
1757
1758   /* Simplify the new loop, which is used when COND is false.  */
1759   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1760     {
1761       loop_info &linfo = get_loop_info (loop);
1762       if (linfo.optimized_loop)
1763         name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
1764     }
1765 }
1766
1767 /* Run the pass and return a set of TODO_* flags.  */
1768
1769 unsigned int
1770 loop_versioning::run ()
1771 {
1772   gcc_assert (scev_initialized_p ());
1773
1774   if (analyze_blocks ()
1775       && prune_conditions ()
1776       && make_versioning_decisions ())
1777     implement_versioning_decisions ();
1778
1779   return 0;
1780 }
1781
1782 /* Loop versioning pass.  */
1783
1784 const pass_data pass_data_loop_versioning =
1785 {
1786   GIMPLE_PASS, /* type */
1787   "lversion", /* name */
1788   OPTGROUP_LOOP, /* optinfo_flags */
1789   TV_LOOP_VERSIONING, /* tv_id */
1790   PROP_cfg, /* properties_required */
1791   0, /* properties_provided */
1792   0, /* properties_destroyed */
1793   0, /* todo_flags_start */
1794   0, /* todo_flags_finish */
1795 };
1796
1797 class pass_loop_versioning : public gimple_opt_pass
1798 {
1799 public:
1800   pass_loop_versioning (gcc::context *ctxt)
1801     : gimple_opt_pass (pass_data_loop_versioning, ctxt)
1802   {}
1803
1804   /* opt_pass methods: */
1805   virtual bool gate (function *) { return flag_version_loops_for_strides; }
1806   virtual unsigned int execute (function *);
1807 };
1808
1809 unsigned int
1810 pass_loop_versioning::execute (function *fn)
1811 {
1812   if (number_of_loops (fn) <= 1)
1813     return 0;
1814
1815   return loop_versioning (fn).run ();
1816 }
1817
1818 } // anon namespace
1819
1820 gimple_opt_pass *
1821 make_pass_loop_versioning (gcc::context *ctxt)
1822 {
1823   return new pass_loop_versioning (ctxt);
1824 }