gcc/gimple-loop-versioning.cc

   1 /* Loop versioning pass.
   2    Copyright (C) 2018-2020 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "backend.h"
  24 #include "tree.h"
  25 #include "gimple.h"
  26 #include "gimple-iterator.h"
  27 #include "tree-pass.h"
  28 #include "gimplify-me.h"
  29 #include "cfgloop.h"
  30 #include "tree-ssa-loop.h"
  31 #include "ssa.h"
  32 #include "tree-scalar-evolution.h"
  33 #include "tree-chrec.h"
  34 #include "tree-ssa-loop-ivopts.h"
  35 #include "fold-const.h"
  36 #include "tree-ssa-propagate.h"
  37 #include "tree-inline.h"
  38 #include "domwalk.h"
  39 #include "alloc-pool.h"
  40 #include "vr-values.h"
  41 #include "gimple-ssa-evrp-analyze.h"
  42 #include "tree-vectorizer.h"
  43 #include "omp-general.h"
  44 #include "predict.h"
  45 #include "tree-into-ssa.h"
  46
  47 namespace {
  48
  49 /* This pass looks for loops that could be simplified if certain loop
  50    invariant conditions were true.  It is effectively a form of loop
  51    splitting in which the pass produces the split conditions itself,
  52    instead of using ones that are already present in the IL.
  53
  54    Versioning for when strides are 1
  55    ---------------------------------
  56
  57    At the moment the only thing the pass looks for are memory references
  58    like:
  59
  60      for (auto i : ...)
  61        ...x[i * stride]...
  62
  63    It considers changing such loops to:
  64
  65      if (stride == 1)
  66        for (auto i : ...)    [A]
  67          ...x[i]...
  68      else
  69        for (auto i : ...)    [B]
  70          ...x[i * stride]...
  71
  72    This can have several benefits:
  73
  74    (1) [A] is often easier or cheaper to vectorize than [B].
  75
  76    (2) The scalar code in [A] is simpler than the scalar code in [B]
  77        (if the loops cannot be vectorized or need an epilogue loop).
  78
  79    (3) We might recognize [A] as a pattern, such as a memcpy or memset.
  80
  81    (4) [A] has simpler address evolutions, which can help other passes
  82        like loop interchange.
  83
  84    The optimization is particularly useful for assumed-shape arrays in
  85    Fortran, where the stride of the innermost dimension depends on the
  86    array descriptor but is often equal to 1 in practice.  For example:
  87
  88      subroutine f1(x)
  89        real :: x(:)
  90        x(:) = 100
  91      end subroutine f1
  92
  93    generates the equivalent of:
  94
  95      raw_stride = *x.dim[0].stride;
  96      stride = raw_stride != 0 ? raw_stride : 1;
  97      x_base = *x.data;
  98      ...
  99      tmp1 = stride * S;
 100      tmp2 = tmp1 - stride;
 101      *x_base[tmp2] = 1.0e+2;
 102
 103    but in the common case that stride == 1, the last three statements
 104    simplify to:
 105
 106      tmp3 = S + -1;
 107      *x_base[tmp3] = 1.0e+2;
 108
 109    The optimization is in principle very simple.  The difficult parts are:
 110
 111    (a) deciding which parts of a general address calculation correspond
 112        to the inner dimension of an array, since this usually isn't explicit
 113        in the IL, and for C often isn't even explicit in the source code
 114
 115    (b) estimating when the transformation is worthwhile
 116
 117    Structure
 118    ---------
 119
 120    The pass has four phases:
 121
 122    (1) Walk through the statements looking for and recording potential
 123        versioning opportunities.  Stop if there are none.
 124
 125    (2) Use context-sensitive range information to see whether any versioning
 126        conditions are impossible in practice.  Remove them if so, and stop
 127        if no opportunities remain.
 128
 129        (We do this only after (1) to keep compile time down when no
 130        versioning opportunities exist.)
 131
 132    (3) Apply the cost model.  Decide which versioning opportunities are
 133        worthwhile and at which nesting level they should be applied.
 134
 135    (4) Attempt to version all the loops selected by (3), so that:
 136
 137          for (...)
 138            ...
 139
 140        becomes:
 141
 142          if (!cond)
 143            for (...) // Original loop
 144              ...
 145          else
 146            for (...) // New loop
 147              ...
 148
 149        Use the version condition COND to simplify the new loop.  */
 150
 151 /* Enumerates the likelihood that a particular value indexes the inner
 152    dimension of an array.  */
 153 enum inner_likelihood {
 154   INNER_UNLIKELY,
 155   INNER_DONT_KNOW,
 156   INNER_LIKELY
 157 };
 158
 159 /* Information about one term of an address_info.  */
 160 struct address_term_info
 161 {
 162   /* The value of the term is EXPR * MULTIPLIER.  */
 163   tree expr;
 164   unsigned HOST_WIDE_INT multiplier;
 165
 166   /* The stride applied by EXPR in each iteration of some unrecorded loop,
 167      or null if no stride has been identified.  */
 168   tree stride;
 169
 170   /* Enumerates the likelihood that EXPR indexes the inner dimension
 171      of an array.  */
 172   enum inner_likelihood inner_likelihood;
 173
 174   /* True if STRIDE == 1 is a versioning opportunity when considered
 175      in isolation.  */
 176   bool versioning_opportunity_p;
 177 };
 178
 179 /* Information about an address calculation, and the range of constant
 180    offsets applied to it.  */
 181 class address_info
 182 {
 183 public:
 184   static const unsigned int MAX_TERMS = 8;
 185
 186   /* One statement that calculates the address.  If multiple statements
 187      share the same address, we only record the first.  */
 188   gimple *stmt;
 189
 190   /* The loop containing STMT (cached for convenience).  If multiple
 191      statements share the same address, they all belong to this loop.  */
 192   class loop *loop;
 193
 194   /* A decomposition of the calculation into a sum of terms plus an
 195      optional base.  When BASE is provided, it is never an SSA name.
 196      Once initialization is complete, all members of TERMs are SSA names.  */
 197   tree base;
 198   auto_vec<address_term_info, MAX_TERMS> terms;
 199
 200   /* All bytes accessed from the address fall in the offset range
 201      [MIN_OFFSET, MAX_OFFSET).  */
 202   HOST_WIDE_INT min_offset, max_offset;
 203 };
 204
 205 /* Stores addresses based on their base and terms (ignoring the offsets).  */
 206 struct address_info_hasher : nofree_ptr_hash <address_info>
 207 {
 208   static hashval_t hash (const address_info *);
 209   static bool equal (const address_info *, const address_info *);
 210 };
 211
 212 /* Information about the versioning we'd like to apply to a loop.  */
 213 class loop_info
 214 {
 215 public:
 216   bool worth_versioning_p () const;
 217
 218   /* True if we've decided not to version this loop.  The remaining
 219      fields are meaningless if so.  */
 220   bool rejected_p;
 221
 222   /* True if at least one subloop of this loop benefits from versioning.  */
 223   bool subloops_benefit_p;
 224
 225   /* An estimate of the total number of instructions in the loop,
 226      excluding those in subloops that benefit from versioning.  */
 227   unsigned int num_insns;
 228
 229   /* The outermost loop that can handle all the version checks
 230      described below.  */
 231   class loop *outermost;
 232
 233   /* The first entry in the list of blocks that belong to this loop
 234      (and not to subloops).  m_next_block_in_loop provides the chain
 235      pointers for the list.  */
 236   basic_block block_list;
 237
 238   /* We'd like to version the loop for the case in which these SSA names
 239      (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime.  */
 240   bitmap_head unity_names;
 241
 242   /* If versioning succeeds, this points the version of the loop that
 243      assumes the version conditions holds.  */
 244   class loop *optimized_loop;
 245 };
 246
 247 /* The main pass structure.  */
 248 class loop_versioning
 249 {
 250 public:
 251   loop_versioning (function *);
 252   ~loop_versioning ();
 253   unsigned int run ();
 254
 255 private:
 256   /* Used to walk the dominator tree to find loop versioning conditions
 257      that are always false.  */
 258   class lv_dom_walker : public dom_walker
 259   {
 260   public:
 261     lv_dom_walker (loop_versioning &);
 262
 263     edge before_dom_children (basic_block) FINAL OVERRIDE;
 264     void after_dom_children (basic_block) FINAL OVERRIDE;
 265
 266   private:
 267     /* The parent pass.  */
 268     loop_versioning &m_lv;
 269
 270     /* Used to build context-dependent range information.  */
 271     evrp_range_analyzer m_range_analyzer;
 272   };
 273
 274   /* Used to simplify statements based on conditions that are established
 275      by the version checks.  */
 276   class name_prop : public substitute_and_fold_engine
 277   {
 278   public:
 279     name_prop (loop_info &li) : m_li (li) {}
 280     tree value_of_expr (tree name, gimple *) FINAL OVERRIDE;
 281
 282   private:
 283     /* Information about the versioning we've performed on the loop.  */
 284     loop_info &m_li;
 285   };
 286
 287   loop_info &get_loop_info (class loop *loop) { return m_loops[loop->num]; }
 288
 289   unsigned int max_insns_for_loop (class loop *);
 290   bool expensive_stmt_p (gimple *);
 291
 292   void version_for_unity (gimple *, tree);
 293   bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
 294                                 unsigned HOST_WIDE_INT * = 0);
 295   bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
 296   bool multiply_term_by (address_term_info &, tree);
 297   inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
 298   void dump_inner_likelihood (address_info &, address_term_info &);
 299   void analyze_stride (address_info &, address_term_info &,
 300                        tree, class loop *);
 301   bool find_per_loop_multiplication (address_info &, address_term_info &);
 302   bool analyze_term_using_scevs (address_info &, address_term_info &);
 303   void analyze_arbitrary_term (address_info &, address_term_info &);
 304   void analyze_address_fragment (address_info &);
 305   void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
 306                                 tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
 307   void analyze_expr (gimple *, tree);
 308   bool analyze_block (basic_block);
 309   bool analyze_blocks ();
 310
 311   void prune_loop_conditions (class loop *, vr_values *);
 312   bool prune_conditions ();
 313
 314   void merge_loop_info (class loop *, class loop *);
 315   void add_loop_to_queue (class loop *);
 316   bool decide_whether_loop_is_versionable (class loop *);
 317   bool make_versioning_decisions ();
 318
 319   bool version_loop (class loop *);
 320   void implement_versioning_decisions ();
 321
 322   /* The function we're optimizing.  */
 323   function *m_fn;
 324
 325   /* The obstack to use for all pass-specific bitmaps.  */
 326   bitmap_obstack m_bitmap_obstack;
 327
 328   /* An obstack to use for general allocation.  */
 329   obstack m_obstack;
 330
 331   /* The number of loops in the function.  */
 332   unsigned int m_nloops;
 333
 334   /* The total number of loop version conditions we've found.  */
 335   unsigned int m_num_conditions;
 336
 337   /* Assume that an address fragment of the form i * stride * scale
 338      (for variable stride and constant scale) will not benefit from
 339      versioning for stride == 1 when scale is greater than this value.  */
 340   unsigned HOST_WIDE_INT m_maximum_scale;
 341
 342   /* Information about each loop.  */
 343   auto_vec<loop_info> m_loops;
 344
 345   /* Used to form a linked list of blocks that belong to a loop,
 346      started by loop_info::block_list.  */
 347   auto_vec<basic_block> m_next_block_in_loop;
 348
 349   /* The list of loops that we've decided to version.  */
 350   auto_vec<class loop *> m_loops_to_version;
 351
 352   /* A table of addresses in the current loop, keyed off their values
 353      but not their offsets.  */
 354   hash_table <address_info_hasher> m_address_table;
 355
 356   /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
 357   auto_vec <address_info *, 32> m_address_list;
 358 };
 359
 360 /* If EXPR is an SSA name and not a default definition, return the
 361    defining statement, otherwise return null.  */
 362
 363 static gimple *
 364 maybe_get_stmt (tree expr)
 365 {
 366   if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
 367     return SSA_NAME_DEF_STMT (expr);
 368   return NULL;
 369 }
 370
 371 /* Like maybe_get_stmt, but also return null if the defining
 372    statement isn't an assignment.  */
 373
 374 static gassign *
 375 maybe_get_assign (tree expr)
 376 {
 377   return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
 378 }
 379
 380 /* Return true if this pass should look through a cast of expression FROM
 381    to type TYPE when analyzing pieces of an address.  */
 382
 383 static bool
 384 look_through_cast_p (tree type, tree from)
 385 {
 386   return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
 387           && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
 388 }
 389
 390 /* Strip all conversions of integers or pointers from EXPR, regardless
 391    of whether the conversions are nops.  This is useful in the context
 392    of this pass because we're not trying to fold or simulate the
 393    expression; we just want to see how it's structured.  */
 394
 395 static tree
 396 strip_casts (tree expr)
 397 {
 398   const unsigned int MAX_NITERS = 4;
 399
 400   tree type = TREE_TYPE (expr);
 401   while (CONVERT_EXPR_P (expr)
 402          && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
 403     expr = TREE_OPERAND (expr, 0);
 404
 405   for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
 406     {
 407       gassign *assign = maybe_get_assign (expr);
 408       if (assign
 409           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
 410           && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
 411         expr = gimple_assign_rhs1 (assign);
 412       else
 413         break;
 414     }
 415   return expr;
 416 }
 417
 418 /* Compare two address_term_infos in the same address_info.  */
 419
 420 static int
 421 compare_address_terms (const void *a_uncast, const void *b_uncast)
 422 {
 423   const address_term_info *a = (const address_term_info *) a_uncast;
 424   const address_term_info *b = (const address_term_info *) b_uncast;
 425
 426   if (a->expr != b->expr)
 427     return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;
 428
 429   if (a->multiplier != b->multiplier)
 430     return a->multiplier < b->multiplier ? -1 : 1;
 431
 432   return 0;
 433 }
 434
 435 /* Dump ADDRESS using flags FLAGS.  */
 436
 437 static void
 438 dump_address_info (dump_flags_t flags, address_info &address)
 439 {
 440   if (address.base)
 441     dump_printf (flags, "%T + ", address.base);
 442   for (unsigned int i = 0; i < address.terms.length (); ++i)
 443     {
 444       if (i != 0)
 445         dump_printf (flags, " + ");
 446       dump_printf (flags, "%T", address.terms[i].expr);
 447       if (address.terms[i].multiplier != 1)
 448         dump_printf (flags, " * %wd", address.terms[i].multiplier);
 449     }
 450   dump_printf (flags, " + [%wd, %wd]",
 451                address.min_offset, address.max_offset - 1);
 452 }
 453
 454 /* Hash an address_info based on its base and terms.  */
 455
 456 hashval_t
 457 address_info_hasher::hash (const address_info *info)
 458 {
 459   inchash::hash hash;
 460   hash.add_int (info->base ? TREE_CODE (info->base) : 0);
 461   hash.add_int (info->terms.length ());
 462   for (unsigned int i = 0; i < info->terms.length (); ++i)
 463     {
 464       hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
 465       hash.add_hwi (info->terms[i].multiplier);
 466     }
 467   return hash.end ();
 468 }
 469
 470 /* Return true if two address_infos have equal bases and terms.  Other
 471    properties might be different (such as the statement or constant
 472    offset range).  */
 473
 474 bool
 475 address_info_hasher::equal (const address_info *a, const address_info *b)
 476 {
 477   if (a->base != b->base
 478       && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
 479     return false;
 480
 481   if (a->terms.length () != b->terms.length ())
 482     return false;
 483
 484   for (unsigned int i = 0; i < a->terms.length (); ++i)
 485     if (a->terms[i].expr != b->terms[i].expr
 486         || a->terms[i].multiplier != b->terms[i].multiplier)
 487       return false;
 488
 489   return true;
 490 }
 491
 492 /* Return true if we want to version the loop, i.e. if we have a
 493    specific reason for doing so and no specific reason not to.  */
 494
 495 bool
 496 loop_info::worth_versioning_p () const
 497 {
 498   return (!rejected_p
 499           && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
 500 }
 501
 502 loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
 503   : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
 504 {
 505 }
 506
 507 /* Process BB before processing the blocks it dominates.  */
 508
 509 edge
 510 loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
 511 {
 512   m_range_analyzer.enter (bb);
 513
 514   if (bb == bb->loop_father->header)
 515     m_lv.prune_loop_conditions (bb->loop_father, &m_range_analyzer);
 516
 517   for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 518        gsi_next (&si))
 519     m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false);
 520
 521   return NULL;
 522 }
 523
 524 /* Process BB after processing the blocks it dominates.  */
 525
 526 void
 527 loop_versioning::lv_dom_walker::after_dom_children (basic_block bb)
 528 {
 529   m_range_analyzer.leave (bb);
 530 }
 531
 532 /* Decide whether to replace VAL with a new value in a versioned loop.
 533    Return the new value if so, otherwise return null.  */
 534
 535 tree
 536 loop_versioning::name_prop::value_of_expr (tree val, gimple *)
 537 {
 538   if (TREE_CODE (val) == SSA_NAME
 539       && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
 540     return build_one_cst (TREE_TYPE (val));
 541   return NULL_TREE;
 542 }
 543
 544 /* Initialize the structure to optimize FN.  */
 545
 546 loop_versioning::loop_versioning (function *fn)
 547   : m_fn (fn),
 548     m_nloops (number_of_loops (fn)),
 549     m_num_conditions (0),
 550     m_address_table (31)
 551 {
 552   bitmap_obstack_initialize (&m_bitmap_obstack);
 553   gcc_obstack_init (&m_obstack);
 554
 555   /* Initialize the loop information.  */
 556   m_loops.safe_grow_cleared (m_nloops, true);
 557   for (unsigned int i = 0; i < m_nloops; ++i)
 558     {
 559       m_loops[i].outermost = get_loop (m_fn, 0);
 560       bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
 561     }
 562
 563   /* Initialize the list of blocks that belong to each loop.  */
 564   unsigned int nbbs = last_basic_block_for_fn (fn);
 565   m_next_block_in_loop.safe_grow (nbbs, true);
 566   basic_block bb;
 567   FOR_EACH_BB_FN (bb, fn)
 568     {
 569       loop_info &li = get_loop_info (bb->loop_father);
 570       m_next_block_in_loop[bb->index] = li.block_list;
 571       li.block_list = bb;
 572     }
 573
 574   /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
 575      unvectorizable code, since it is the largest size that can be
 576      handled efficiently by scalar code.  omp_max_vf calculates the
 577      maximum number of bytes in a vector, when such a value is relevant
 578      to loop optimization.  */
 579   m_maximum_scale = estimated_poly_value (omp_max_vf ());
 580   m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
 581 }
 582
 583 loop_versioning::~loop_versioning ()
 584 {
 585   bitmap_obstack_release (&m_bitmap_obstack);
 586   obstack_free (&m_obstack, NULL);
 587 }
 588
 589 /* Return the maximum number of instructions allowed in LOOP before
 590    it becomes too big for versioning.
 591
 592    There are separate limits for inner and outer loops.  The limit for
 593    inner loops applies only to loops that benefit directly from versioning.
 594    The limit for outer loops applies to all code in the outer loop and
 595    its subloops that *doesn't* benefit directly from versioning; such code
 596    would be "taken along for the ride".  The idea is that if the cost of
 597    the latter is small, it is better to version outer loops rather than
 598    inner loops, both to reduce the number of repeated checks and to enable
 599    more of the loop nest to be optimized as a natural nest (e.g. by loop
 600    interchange or outer-loop vectorization).  */
 601
 602 unsigned int
 603 loop_versioning::max_insns_for_loop (class loop *loop)
 604 {
 605   return (loop->inner
 606           ? param_loop_versioning_max_outer_insns
 607           : param_loop_versioning_max_inner_insns);
 608 }
 609
 610 /* Return true if for cost reasons we should avoid versioning any loop
 611    that contains STMT.
 612
 613    Note that we don't need to check whether versioning is invalid for
 614    correctness reasons, since the versioning process does that for us.
 615    The conditions involved are too rare to be worth duplicating here.  */
 616
 617 bool
 618 loop_versioning::expensive_stmt_p (gimple *stmt)
 619 {
 620   if (gcall *call = dyn_cast <gcall *> (stmt))
 621     /* Assume for now that the time spent in an "expensive" call would
 622        overwhelm any saving from versioning.  */
 623     return !gimple_inexpensive_call_p (call);
 624   return false;
 625 }
 626
 627 /* Record that we want to version the loop that contains STMT for the
 628    case in which SSA name NAME is equal to 1.  We already know that NAME
 629    is invariant in the loop.  */
 630
 631 void
 632 loop_versioning::version_for_unity (gimple *stmt, tree name)
 633 {
 634   class loop *loop = loop_containing_stmt (stmt);
 635   loop_info &li = get_loop_info (loop);
 636
 637   if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
 638     {
 639       /* This is the first time we've wanted to version LOOP for NAME.
 640          Keep track of the outermost loop that can handle all versioning
 641          checks in LI.  */
 642       class loop *outermost
 643         = outermost_invariant_loop_for_expr (loop, name);
 644       if (loop_depth (li.outermost) < loop_depth (outermost))
 645         li.outermost = outermost;
 646
 647       if (dump_enabled_p ())
 648         {
 649           dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
 650                            " for when %T == 1", name);
 651           if (outermost == loop)
 652             dump_printf (MSG_NOTE, "; cannot hoist check further");
 653           else
 654             {
 655               dump_printf (MSG_NOTE, "; could implement the check at loop"
 656                            " depth %d", loop_depth (outermost));
 657               if (loop_depth (li.outermost) > loop_depth (outermost))
 658                 dump_printf (MSG_NOTE, ", but other checks only allow"
 659                              " a depth of %d", loop_depth (li.outermost));
 660             }
 661           dump_printf (MSG_NOTE, "\n");
 662         }
 663
 664       m_num_conditions += 1;
 665     }
 666   else
 667     {
 668       /* This is a duplicate request.  */
 669       if (dump_enabled_p ())
 670         dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
 671                          " loop for when %T == 1\n", name);
 672     }
 673 }
 674
 675 /* Return true if OP1_TREE is constant and if in principle it is worth
 676    versioning an address fragment of the form:
 677
 678      i * OP1_TREE * OP2 * stride
 679
 680    for the case in which stride == 1.  This in practice means testing
 681    whether:
 682
 683      OP1_TREE * OP2 <= M_MAXIMUM_SCALE.
 684
 685    If RESULT is nonnull, store OP1_TREE * OP2 there when returning true.  */
 686
 687 bool
 688 loop_versioning::acceptable_multiplier_p (tree op1_tree,
 689                                           unsigned HOST_WIDE_INT op2,
 690                                           unsigned HOST_WIDE_INT *result)
 691 {
 692   if (tree_fits_uhwi_p (op1_tree))
 693     {
 694       unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
 695       /* The first part checks for overflow.  */
 696       if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
 697         {
 698           if (result)
 699             *result = op1 * op2;
 700           return true;
 701         }
 702     }
 703   return false;
 704 }
 705
 706 /* Return true if it is worth using loop versioning on a memory access
 707    of type TYPE.  Store the size of the access in *SIZE if so.  */
 708
 709 bool
 710 loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
 711 {
 712   return (TYPE_SIZE_UNIT (type)
 713           && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
 714 }
 715
 716 /* See whether OP is constant and whether we can multiply TERM by that
 717    constant without exceeding M_MAXIMUM_SCALE.  Return true and update
 718    TERM if so.  */
 719
 720 bool
 721 loop_versioning::multiply_term_by (address_term_info &term, tree op)
 722 {
 723   return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
 724 }
 725
 726 /* Decide whether an address fragment of the form STRIDE * MULTIPLIER
 727    is likely to be indexing an innermost dimension, returning the result
 728    as an INNER_* probability.  */
 729
 730 inner_likelihood
 731 loop_versioning::get_inner_likelihood (tree stride,
 732                                        unsigned HOST_WIDE_INT multiplier)
 733 {
 734   const unsigned int MAX_NITERS = 8;
 735
 736   /* Iterate over possible values of STRIDE.  Return INNER_LIKELY if at
 737      least one of those values is likely to be for the innermost dimension.
 738      Record in UNLIKELY_P if at least one of those values is unlikely to be
 739      for the innermost dimension.
 740
 741      E.g. for:
 742
 743        stride = cond ? a * b : 1
 744
 745      we should treat STRIDE as being a likely inner dimension, since
 746      we know that it is 1 under at least some circumstances.  (See the
 747      Fortran example below.)  However:
 748
 749        stride = a * b
 750
 751      on its own is unlikely to be for the innermost dimension, since
 752      that would require both a and b to be 1 at runtime.  */
 753   bool unlikely_p = false;
 754   tree worklist[MAX_NITERS];
 755   unsigned int length = 0;
 756   worklist[length++] = stride;
 757   for (unsigned int i = 0; i < length; ++i)
 758     {
 759       tree expr = worklist[i];
 760
 761       if (CONSTANT_CLASS_P (expr))
 762         {
 763           /* See if EXPR * MULTIPLIER would be consistent with an individual
 764              access or a small grouped access.  */
 765           if (acceptable_multiplier_p (expr, multiplier))
 766             return INNER_LIKELY;
 767           else
 768             unlikely_p = true;
 769         }
 770       else if (gimple *stmt = maybe_get_stmt (expr))
 771         {
 772           /* If EXPR is set by a PHI node, queue its arguments in case
 773              we find one that is consistent with an inner dimension.
 774
 775              An important instance of this is the Fortran handling of array
 776              descriptors, which calculates the stride of the inner dimension
 777              using a PHI equivalent of:
 778
 779                 raw_stride = a.dim[0].stride;
 780                 stride = raw_stride != 0 ? raw_stride : 1;
 781
 782              (Strides for outer dimensions do not treat 0 specially.)  */
 783           if (gphi *phi = dyn_cast <gphi *> (stmt))
 784             {
 785               unsigned int nargs = gimple_phi_num_args (phi);
 786               for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
 787                 worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
 788             }
 789           /* If the value is set by an assignment, expect it to be read
 790              from memory (such as an array descriptor) rather than be
 791              calculated.  */
 792           else if (gassign *assign = dyn_cast <gassign *> (stmt))
 793             {
 794               if (!gimple_assign_load_p (assign))
 795                 unlikely_p = true;
 796             }
 797           /* Things like calls don't really tell us anything.  */
 798         }
 799     }
 800
 801   /* We didn't find any possible values of STRIDE that were likely to be
 802      for the innermost dimension.  If we found one that was actively
 803      unlikely to be for the innermost dimension, assume that that applies
 804      to STRIDE too.  */
 805   return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
 806 }
 807
 808 /* Dump the likelihood that TERM's stride is for the innermost dimension.
 809    ADDRESS is the address that contains TERM.  */
 810
 811 void
 812 loop_versioning::dump_inner_likelihood (address_info &address,
 813                                         address_term_info &term)
 814 {
 815   if (term.inner_likelihood == INNER_LIKELY)
 816     dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
 817                      " innermost dimension\n", term.stride);
 818   else if (term.inner_likelihood == INNER_UNLIKELY)
 819     dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
 820                      " innermost dimension\n", term.stride);
 821   else
 822     dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
 823                      " is the innermost dimension\n", term.stride);
 824 }
 825
 826 /* The caller has identified that STRIDE is the stride of interest
 827    in TERM, and that the stride is applied in OP_LOOP.  Record this
 828    information in TERM, deciding whether STRIDE is likely to be for
 829    the innermost dimension of an array and whether it represents a
 830    versioning opportunity.  ADDRESS is the address that contains TERM.  */
 831
 832 void
 833 loop_versioning::analyze_stride (address_info &address,
 834                                  address_term_info &term,
 835                                  tree stride, class loop *op_loop)
 836 {
 837   term.stride = stride;
 838
 839   term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
 840   if (dump_enabled_p ())
 841     dump_inner_likelihood (address, term);
 842
 843   /* To be a versioning opportunity we require:
 844
 845      - The multiplier applied by TERM is equal to the access size,
 846        so that when STRIDE is 1, the accesses in successive loop
 847        iterations are consecutive.
 848
 849        This is deliberately conservative.  We could relax it to handle
 850        other cases (such as those with gaps between iterations) if we
 851        find any real testcases for which it's useful.
 852
 853      - the stride is applied in the same loop as STMT rather than
 854        in an outer loop.  Although versioning for strides applied in
 855        outer loops could help in some cases -- such as enabling
 856        more loop interchange -- the savings are much lower than for
 857        inner loops.
 858
 859      - the stride is an SSA name that is invariant in STMT's loop,
 860        since otherwise versioning isn't possible.  */
 861   unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
 862   if (term.multiplier == access_size
 863       && address.loop == op_loop
 864       && TREE_CODE (stride) == SSA_NAME
 865       && expr_invariant_in_loop_p (address.loop, stride))
 866     {
 867       term.versioning_opportunity_p = true;
 868       if (dump_enabled_p ())
 869         dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
 870                          " opportunity\n", stride);
 871     }
 872 }
 873
 874 /* See whether address term TERM (which belongs to ADDRESS) is the result
 875    of multiplying a varying SSA name by a loop-invariant SSA name.
 876    Return true and update TERM if so.
 877
 878    This handles both cases that SCEV might handle, such as:
 879
 880      for (int i = 0; i < n; ++i)
 881        res += a[i * stride];
 882
 883    and ones in which the term varies arbitrarily between iterations, such as:
 884
 885      for (int i = 0; i < n; ++i)
 886        res += a[index[i] * stride];  */
 887
 888 bool
 889 loop_versioning::find_per_loop_multiplication (address_info &address,
 890                                                address_term_info &term)
 891 {
 892   gassign *mult = maybe_get_assign (term.expr);
 893   if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
 894     return false;
 895
 896   class loop *mult_loop = loop_containing_stmt (mult);
 897   if (!loop_outer (mult_loop))
 898     return false;
 899
 900   tree op1 = strip_casts (gimple_assign_rhs1 (mult));
 901   tree op2 = strip_casts (gimple_assign_rhs2 (mult));
 902   if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
 903     return false;
 904
 905   bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
 906   bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
 907   if (invariant1_p == invariant2_p)
 908     return false;
 909
 910   /* Make sure that the loop invariant is OP2 rather than OP1.  */
 911   if (invariant1_p)
 912     std::swap (op1, op2);
 913
 914   if (dump_enabled_p ())
 915     dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
 916                      " * loop-invariant %T\n", term.expr, op1, op2);
 917   analyze_stride (address, term, op2, mult_loop);
 918   return true;
 919 }
 920
 921 /* Try to use scalar evolutions to find an address stride for TERM,
 922    which belongs to ADDRESS.  Return true and update TERM if so.
 923
 924    Here we are interested in any evolution information we can find,
 925    not just evolutions wrt ADDRESS->LOOP.  For example, if we find that
 926    an outer loop obviously iterates over the inner dimension of an array,
 927    that information can help us eliminate worthless versioning opportunities
 928    in inner loops.  */
 929
 930 bool
 931 loop_versioning::analyze_term_using_scevs (address_info &address,
 932                                            address_term_info &term)
 933 {
 934   gimple *setter = maybe_get_stmt (term.expr);
 935   if (!setter)
 936     return false;
 937
 938   class loop *wrt_loop = loop_containing_stmt (setter);
 939   if (!loop_outer (wrt_loop))
 940     return false;
 941
 942   tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
 943   if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
 944     {
 945       if (dump_enabled_p ())
 946         dump_printf_loc (MSG_NOTE, address.stmt,
 947                          "address term %T = %T\n", term.expr, chrec);
 948
 949       /* Peel casts and accumulate constant multiplications, up to the
 950          limit allowed by M_MAXIMUM_SCALE.  */
 951       tree stride = strip_casts (CHREC_RIGHT (chrec));
 952       while (TREE_CODE (stride) == MULT_EXPR
 953              && multiply_term_by (term, TREE_OPERAND (stride, 1)))
 954         stride = strip_casts (TREE_OPERAND (stride, 0));
 955
 956       gassign *assign;
 957       while ((assign = maybe_get_assign (stride))
 958              && gimple_assign_rhs_code (assign) == MULT_EXPR
 959              && multiply_term_by (term, gimple_assign_rhs2 (assign)))
 960         {
 961           if (dump_enabled_p ())
 962             dump_printf_loc (MSG_NOTE, address.stmt,
 963                              "looking through %G", assign);
 964           stride = strip_casts (gimple_assign_rhs1 (assign));
 965         }
 966
 967       analyze_stride (address, term, stride, get_chrec_loop (chrec));
 968       return true;
 969     }
 970
 971   return false;
 972 }
 973
 974 /* Address term TERM is an arbitrary term that provides no versioning
 975    opportunities.  Analyze it to see whether it contains any likely
 976    inner strides, so that we don't mistakenly version for other
 977    (less likely) candidates.
 978
 979    This copes with invariant innermost indices such as:
 980
 981      x(i, :) = 100
 982
 983    where the "i" component of the address is invariant in the loop
 984    but provides the real inner stride.
 985
 986    ADDRESS is the address that contains TERM.  */
 987
 988 void
 989 loop_versioning::analyze_arbitrary_term (address_info &address,
 990                                          address_term_info &term)
 991
 992 {
 993   /* A multiplication offers two potential strides.  Pick the one that
 994      is most likely to be an innermost stride.  */
 995   tree expr = term.expr, alt = NULL_TREE;
 996   gassign *mult = maybe_get_assign (expr);
 997   if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR)
 998     {
 999       expr = strip_casts (gimple_assign_rhs1 (mult));
1000       alt = strip_casts (gimple_assign_rhs2 (mult));
1001     }
1002   term.stride = expr;
1003   term.inner_likelihood = get_inner_likelihood (expr, term.multiplier);
1004   if (alt)
1005     {
1006       inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier);
1007       if (alt_l > term.inner_likelihood)
1008         {
1009           term.stride = alt;
1010           term.inner_likelihood = alt_l;
1011         }
1012     }
1013   if (dump_enabled_p ())
1014     dump_inner_likelihood (address, term);
1015 }
1016
1017 /* Try to identify loop strides in ADDRESS and try to choose realistic
1018    versioning opportunities based on these strides.
1019
1020    The main difficulty here isn't finding strides that could be used
1021    in a version check (that's pretty easy).  The problem instead is to
1022    avoid versioning for some stride S that is unlikely ever to be 1 at
1023    runtime.  Versioning for S == 1 on its own would lead to unnecessary
1024    code bloat, while adding S == 1 to more realistic version conditions
1025    would lose the optimisation opportunity offered by those other conditions.
1026
1027    For example, versioning for a stride of 1 in the Fortran code:
1028
1029      integer :: a(:,:)
1030      a(1,:) = 1
1031
1032    is not usually a good idea, since the assignment is iterating over
1033    an outer dimension and is relatively unlikely to have a stride of 1.
1034    (It isn't impossible, since the inner dimension might be 1, or the
1035    array might be transposed.)  Similarly, in:
1036
1037      integer :: a(:,:), b(:,:)
1038      b(:,1) = a(1,:)
1039
1040    b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
1041    Versioning for when both strides are 1 would lose most of the benefit
1042    of versioning for b's access.
1043
1044    The approach we take is as follows:
1045
1046    - Analyze each term to see whether it has an identifiable stride,
1047      regardless of which loop applies the stride.
1048
1049    - Evaluate the likelihood that each such stride is for the innermost
1050      dimension of an array, on the scale "likely", "don't know" or "unlikely".
1051
1052    - If there is a single "likely" innermost stride, and that stride is
1053      applied in the loop that contains STMT, version the loop for when the
1054      stride is 1.  This deals with the cases in which we're fairly
1055      confident of doing the right thing, such as the b(:,1) reference above.
1056
1057    - If there are no "likely" innermost strides, and the loop that contains
1058      STMT uses a stride that we rated as "don't know", version for when
1059      that stride is 1.  This is principally used for C code such as:
1060
1061        for (int i = 0; i < n; ++i)
1062          a[i * x] = ...;
1063
1064      and:
1065
1066        for (int j = 0; j < n; ++j)
1067          for (int i = 0; i < n; ++i)
1068            a[i * x + j * y] = ...;
1069
1070      where nothing in the way "x" and "y" are set gives a hint as to
1071      whether "i" iterates over the innermost dimension of the array.
1072      In these situations it seems reasonable to assume the
1073      programmer has nested the loops appropriately (although of course
1074      there are examples like GEMM in which this assumption doesn't hold
1075      for all accesses in the loop).
1076
1077      This case is also useful for the Fortran equivalent of the
1078      above C code.  */
1079
1080 void
1081 loop_versioning::analyze_address_fragment (address_info &address)
1082 {
1083   if (dump_enabled_p ())
1084     {
1085       dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
1086       dump_address_info (MSG_NOTE, address);
1087       dump_printf (MSG_NOTE, "\n");
1088     }
1089
1090   /* Analyze each component of the sum to see whether it involves an
1091      apparent stride.
1092
1093      There is an overlap between the addresses that
1094      find_per_loop_multiplication and analyze_term_using_scevs can handle,
1095      but the former is much cheaper than SCEV analysis, so try it first.  */
1096   for (unsigned int i = 0; i < address.terms.length (); ++i)
1097     if (!find_per_loop_multiplication (address, address.terms[i])
1098         && !analyze_term_using_scevs (address, address.terms[i])
1099         && !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr)))
1100       analyze_arbitrary_term (address, address.terms[i]);
1101
1102   /* Check for strides that are likely to be for the innermost dimension.
1103
1104      1. If there is a single likely inner stride, if it is an SSA name,
1105         and if it is worth versioning the loop for when the SSA name
1106         equals 1, record that we want to do so.
1107
1108      2. Otherwise, if there any likely inner strides, bail out.  This means
1109         one of:
1110
1111         (a) There are multiple likely inner strides.  This suggests we're
1112             confused and be can't be confident of doing the right thing.
1113
1114         (b) There is a single likely inner stride and it is a constant
1115             rather than an SSA name.  This can mean either that the access
1116             is a natural one without any variable strides, such as:
1117
1118               for (int i = 0; i < n; ++i)
1119                 a[i] += 1;
1120
1121             or that a variable stride is applied to an outer dimension,
1122             such as:
1123
1124               for (int i = 0; i < n; ++i)
1125                 for (int j = 0; j < n; ++j)
1126                   a[j * stride][i] += 1;
1127
1128         (c) There is a single likely inner stride, and it is an SSA name,
1129             but it isn't a worthwhile versioning opportunity.  This usually
1130             means that the variable stride is applied by an outer loop,
1131             such as:
1132
1133               for (int i = 0; i < n; ++i)
1134                 for (int j = 0; j < n; ++j)
1135                   a[j][i * stride] += 1;
1136
1137             or (using an example with a more natural loop nesting):
1138
1139               for (int i = 0; i < n; ++i)
1140                 for (int j = 0; j < n; ++j)
1141                   a[i][j] += b[i * stride];
1142
1143             in cases where b[i * stride] cannot (yet) be hoisted for
1144             aliasing reasons.
1145
1146      3. If there are no likely inner strides, fall through to the next
1147         set of checks.
1148
1149      Pointer equality is enough to check for uniqueness in (1), since we
1150      only care about SSA names.  */
1151   tree chosen_stride = NULL_TREE;
1152   tree version_stride = NULL_TREE;
1153   for (unsigned int i = 0; i < address.terms.length (); ++i)
1154     if (chosen_stride != address.terms[i].stride
1155         && address.terms[i].inner_likelihood == INNER_LIKELY)
1156       {
1157         if (chosen_stride)
1158           return;
1159         chosen_stride = address.terms[i].stride;
1160         if (address.terms[i].versioning_opportunity_p)
1161           version_stride = chosen_stride;
1162       }
1163
1164   /* If there are no likely inner strides, see if there is a single
1165      versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
1166      See the comment above the function for the cases that this code
1167      handles.  */
1168   if (!chosen_stride)
1169     for (unsigned int i = 0; i < address.terms.length (); ++i)
1170       if (version_stride != address.terms[i].stride
1171           && address.terms[i].inner_likelihood == INNER_DONT_KNOW
1172           && address.terms[i].versioning_opportunity_p)
1173         {
1174           if (version_stride)
1175             return;
1176           version_stride = address.terms[i].stride;
1177         }
1178
1179   if (version_stride)
1180     version_for_unity (address.stmt, version_stride);
1181 }
1182
1183 /* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
1184    TYPE_SIZE bytes and record this address fragment for later processing.
1185    STMT is the statement that contains the address.  */
1186
1187 void
1188 loop_versioning::record_address_fragment (gimple *stmt,
1189                                           unsigned HOST_WIDE_INT type_size,
1190                                           tree expr,
1191                                           unsigned HOST_WIDE_INT multiplier,
1192                                           HOST_WIDE_INT offset)
1193 {
1194   /* We're only interested in computed values.  */
1195   if (TREE_CODE (expr) != SSA_NAME)
1196     return;
1197
1198   /* Quick exit if no part of the address is calculated in STMT's loop,
1199      since such addresses have no versioning opportunities.  */
1200   class loop *loop = loop_containing_stmt (stmt);
1201   if (expr_invariant_in_loop_p (loop, expr))
1202     return;
1203
1204   /* Set up an address_info for EXPR * MULTIPLIER.  */
1205   address_info *address = XOBNEW (&m_obstack, address_info);
1206   new (address) address_info;
1207   address->stmt = stmt;
1208   address->loop = loop;
1209   address->base = NULL_TREE;
1210   address->terms.quick_grow (1);
1211   address->terms[0].expr = expr;
1212   address->terms[0].multiplier = multiplier;
1213   address->terms[0].stride = NULL_TREE;
1214   address->terms[0].inner_likelihood = INNER_UNLIKELY;
1215   address->terms[0].versioning_opportunity_p = false;
1216   address->min_offset = offset;
1217
1218   /* Peel apart the expression into a sum of address_terms, where each
1219      term is multiplied by a constant.  Treat a + b and a - b the same,
1220      since it doesn't matter for our purposes whether an address is
1221      increasing or decreasing.  Distribute (a + b) * constant into
1222      a * constant + b * constant.
1223
1224      We don't care which loop each term belongs to, since we want to
1225      examine as many candidate strides as possible when determining
1226      which is likely to be for the innermost dimension.  We therefore
1227      don't limit the search to statements in STMT's loop.  */
1228   for (unsigned int i = 0; i < address->terms.length (); )
1229     {
1230       if (gassign *assign = maybe_get_assign (address->terms[i].expr))
1231         {
1232           tree_code code = gimple_assign_rhs_code (assign);
1233           if (code == PLUS_EXPR
1234               || code == POINTER_PLUS_EXPR
1235               || code == MINUS_EXPR)
1236             {
1237               tree op1 = gimple_assign_rhs1 (assign);
1238               tree op2 = gimple_assign_rhs2 (assign);
1239               if (TREE_CODE (op2) == INTEGER_CST)
1240                 {
1241                   address->terms[i].expr = strip_casts (op1);
1242                   /* This is heuristic only, so don't worry about truncation
1243                      or overflow.  */
1244                   address->min_offset += (TREE_INT_CST_LOW (op2)
1245                                           * address->terms[i].multiplier);
1246                   continue;
1247                 }
1248               else if (address->terms.length () < address_info::MAX_TERMS)
1249                 {
1250                   unsigned int j = address->terms.length ();
1251                   address->terms.quick_push (address->terms[i]);
1252                   address->terms[i].expr = strip_casts (op1);
1253                   address->terms[j].expr = strip_casts (op2);
1254                   continue;
1255                 }
1256             }
1257           if (code == MULT_EXPR)
1258             {
1259               tree op1 = gimple_assign_rhs1 (assign);
1260               tree op2 = gimple_assign_rhs2 (assign);
1261               if (multiply_term_by (address->terms[i], op2))
1262                 {
1263                   address->terms[i].expr = strip_casts (op1);
1264                   continue;
1265                 }
1266             }
1267           if (CONVERT_EXPR_CODE_P (code))
1268             {
1269               tree op1 = gimple_assign_rhs1 (assign);
1270               address->terms[i].expr = strip_casts (op1);
1271               continue;
1272             }
1273         }
1274       i += 1;
1275     }
1276
1277   /* Peel off any symbolic pointer.  */
1278   if (TREE_CODE (address->terms[0].expr) != SSA_NAME
1279       && address->terms[0].multiplier == 1)
1280     {
1281       if (address->terms.length () == 1)
1282         {
1283           obstack_free (&m_obstack, address);
1284           return;
1285         }
1286       address->base = address->terms[0].expr;
1287       address->terms.ordered_remove (0);
1288     }
1289
1290   /* Require all remaining terms to be SSA names.  (This could be false
1291      for unfolded statements, but they aren't worth dealing with.)  */
1292   for (unsigned int i = 0; i < address->terms.length (); ++i)
1293     if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
1294       {
1295         obstack_free (&m_obstack, address);
1296         return;
1297       }
1298
1299   /* The loop above set MIN_OFFSET based on the first byte of the
1300      referenced data.  Calculate the end + 1.  */
1301   address->max_offset = address->min_offset + type_size;
1302
1303   /* Put the terms into a canonical order for the hash table lookup below.  */
1304   address->terms.qsort (compare_address_terms);
1305
1306   if (dump_enabled_p ())
1307     {
1308       dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
1309       if (multiplier != 1)
1310         dump_printf (MSG_NOTE, " * %wd", multiplier);
1311       dump_printf (MSG_NOTE, " = ");
1312       dump_address_info (MSG_NOTE, *address);
1313       dump_printf (MSG_NOTE, "\n");
1314     }
1315
1316   /* Pool address information with the same terms (but potentially
1317      different offsets).  */
1318   address_info **slot = m_address_table.find_slot (address, INSERT);
1319   if (address_info *old_address = *slot)
1320     {
1321       /* We've already seen an address with the same terms.  Extend the
1322          offset range to account for this access.  Doing this can paper
1323          over gaps, such as in:
1324
1325            a[i * stride * 4] + a[i * stride * 4 + 3];
1326
1327          where nothing references "+ 1" or "+ 2".  However, the vectorizer
1328          handles such gapped accesses without problems, so it's not worth
1329          trying to exclude them.  */
1330       if (old_address->min_offset > address->min_offset)
1331         old_address->min_offset = address->min_offset;
1332       if (old_address->max_offset < address->max_offset)
1333         old_address->max_offset = address->max_offset;
1334       obstack_free (&m_obstack, address);
1335     }
1336   else
1337     {
1338       /* This is the first time we've seen an address with these terms.  */
1339       *slot = address;
1340       m_address_list.safe_push (address);
1341     }
1342 }
1343
1344 /* Analyze expression EXPR, which occurs in STMT.  */
1345
1346 void
1347 loop_versioning::analyze_expr (gimple *stmt, tree expr)
1348 {
1349   unsigned HOST_WIDE_INT type_size;
1350
1351   while (handled_component_p (expr))
1352     {
1353       /* See whether we can use versioning to avoid a multiplication
1354          in an array index.  */
1355       if (TREE_CODE (expr) == ARRAY_REF
1356           && acceptable_type_p (TREE_TYPE (expr), &type_size))
1357         record_address_fragment (stmt, type_size,
1358                                  TREE_OPERAND (expr, 1), type_size, 0);
1359       expr = TREE_OPERAND (expr, 0);
1360     }
1361
1362   /* See whether we can use versioning to avoid a multiplication
1363      in the pointer calculation of a MEM_REF.  */
1364   if (TREE_CODE (expr) == MEM_REF
1365       && acceptable_type_p (TREE_TYPE (expr), &type_size))
1366     record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
1367                              /* This is heuristic only, so don't worry
1368                                 about truncation or overflow.  */
1369                              TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));
1370
1371   /* These would be easy to handle if they existed at this stage.  */
1372   gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
1373 }
1374
1375 /* Analyze all the statements in BB looking for useful version checks.
1376    Return true on success, false if something prevents the block from
1377    being versioned.  */
1378
1379 bool
1380 loop_versioning::analyze_block (basic_block bb)
1381 {
1382   class loop *loop = bb->loop_father;
1383   loop_info &li = get_loop_info (loop);
1384   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1385        gsi_next (&gsi))
1386     {
1387       gimple *stmt = gsi_stmt (gsi);
1388       if (is_gimple_debug (stmt))
1389         continue;
1390
1391       if (expensive_stmt_p (stmt))
1392         {
1393           if (dump_enabled_p ())
1394             dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
1395                              " prevents versioning: %G", stmt);
1396           return false;
1397         }
1398
1399       /* Only look for direct versioning opportunities in inner loops
1400          since the benefit tends to be much smaller for outer loops.  */
1401       if (!loop->inner)
1402         {
1403           unsigned int nops = gimple_num_ops (stmt);
1404           for (unsigned int i = 0; i < nops; ++i)
1405             if (tree op = gimple_op (stmt, i))
1406               analyze_expr (stmt, op);
1407         }
1408
1409       /* The point of the instruction limit is to prevent excessive
1410          code growth, so this is a size-based estimate even though
1411          the optimization is aimed at speed.  */
1412       li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
1413     }
1414
1415   return true;
1416 }
1417
1418 /* Analyze all the blocks in the function, looking for useful version checks.
1419    Return true if we found one.  */
1420
1421 bool
1422 loop_versioning::analyze_blocks ()
1423 {
1424   AUTO_DUMP_SCOPE ("analyze_blocks",
1425                    dump_user_location_t::from_function_decl (m_fn->decl));
1426
1427   /* For now we don't try to version the whole function, although
1428      versioning at that level could be useful in some cases.  */
1429   get_loop_info (get_loop (m_fn, 0)).rejected_p = true;
1430
1431   class loop *loop;
1432   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1433     {
1434       loop_info &linfo = get_loop_info (loop);
1435
1436       /* Ignore cold loops.  */
1437       if (!optimize_loop_for_speed_p (loop))
1438         linfo.rejected_p = true;
1439
1440       /* See whether an inner loop prevents versioning of this loop.  */
1441       if (!linfo.rejected_p)
1442         for (class loop *inner = loop->inner; inner; inner = inner->next)
1443           if (get_loop_info (inner).rejected_p)
1444             {
1445               linfo.rejected_p = true;
1446               break;
1447             }
1448
1449       /* If versioning the loop is still a possibility, examine the
1450          statements in the loop to look for versioning opportunities.  */
1451       if (!linfo.rejected_p)
1452         {
1453           void *start_point = obstack_alloc (&m_obstack, 0);
1454
1455           for (basic_block bb = linfo.block_list; bb;
1456                bb = m_next_block_in_loop[bb->index])
1457             if (!analyze_block (bb))
1458               {
1459                 linfo.rejected_p = true;
1460                 break;
1461             }
1462
1463           if (!linfo.rejected_p)
1464             {
1465               /* Process any queued address fragments, now that we have
1466                  complete grouping information.  */
1467               address_info *address;
1468               unsigned int i;
1469               FOR_EACH_VEC_ELT (m_address_list, i, address)
1470                 analyze_address_fragment (*address);
1471             }
1472
1473           m_address_table.empty ();
1474           m_address_list.truncate (0);
1475           obstack_free (&m_obstack, start_point);
1476         }
1477     }
1478
1479   return m_num_conditions != 0;
1480 }
1481
1482 /* Use the ranges in VRS to remove impossible versioning conditions from
1483    LOOP.  */
1484
1485 void
1486 loop_versioning::prune_loop_conditions (class loop *loop, vr_values *vrs)
1487 {
1488   loop_info &li = get_loop_info (loop);
1489
1490   int to_remove = -1;
1491   bitmap_iterator bi;
1492   unsigned int i;
1493   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1494     {
1495       tree name = ssa_name (i);
1496       const value_range_equiv *vr = vrs->get_value_range (name);
1497       if (vr && !vr->may_contain_p (build_one_cst (TREE_TYPE (name))))
1498         {
1499           if (dump_enabled_p ())
1500             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1501                              "%T can never be 1 in this loop\n", name);
1502
1503           if (to_remove >= 0)
1504             bitmap_clear_bit (&li.unity_names, to_remove);
1505           to_remove = i;
1506           m_num_conditions -= 1;
1507         }
1508     }
1509   if (to_remove >= 0)
1510     bitmap_clear_bit (&li.unity_names, to_remove);
1511 }
1512
1513 /* Remove any scheduled loop version conditions that will never be true.
1514    Return true if any remain.  */
1515
1516 bool
1517 loop_versioning::prune_conditions ()
1518 {
1519   AUTO_DUMP_SCOPE ("prune_loop_conditions",
1520                    dump_user_location_t::from_function_decl (m_fn->decl));
1521
1522   calculate_dominance_info (CDI_DOMINATORS);
1523   lv_dom_walker dom_walker (*this);
1524   dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
1525   return m_num_conditions != 0;
1526 }
1527
1528 /* Merge the version checks for INNER into immediately-enclosing loop
1529    OUTER.  */
1530
1531 void
1532 loop_versioning::merge_loop_info (class loop *outer, class loop *inner)
1533 {
1534   loop_info &inner_li = get_loop_info (inner);
1535   loop_info &outer_li = get_loop_info (outer);
1536
1537   if (dump_enabled_p ())
1538     {
1539       bitmap_iterator bi;
1540       unsigned int i;
1541       EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
1542         if (!bitmap_bit_p (&outer_li.unity_names, i))
1543           dump_printf_loc (MSG_NOTE, find_loop_location (inner),
1544                            "hoisting check that %T == 1 to outer loop\n",
1545                            ssa_name (i));
1546     }
1547
1548   bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
1549   if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
1550     outer_li.outermost = inner_li.outermost;
1551 }
1552
1553 /* Add LOOP to the queue of loops to version.  */
1554
1555 void
1556 loop_versioning::add_loop_to_queue (class loop *loop)
1557 {
1558   loop_info &li = get_loop_info (loop);
1559
1560   if (dump_enabled_p ())
1561     dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1562                      "queuing this loop for versioning\n");
1563   m_loops_to_version.safe_push (loop);
1564
1565   /* Don't try to version superloops.  */
1566   li.rejected_p = true;
1567 }
1568
1569 /* Decide whether the cost model would allow us to version LOOP,
1570    either directly or as part of a parent loop, and return true if so.
1571    This does not imply that the loop is actually worth versioning in its
1572    own right, just that it would be valid to version it if something
1573    benefited.
1574
1575    We have already made this decision for all inner loops of LOOP.  */
1576
1577 bool
1578 loop_versioning::decide_whether_loop_is_versionable (class loop *loop)
1579 {
1580   loop_info &li = get_loop_info (loop);
1581
1582   if (li.rejected_p)
1583     return false;
1584
1585   /* Examine the decisions made for inner loops.  */
1586   for (class loop *inner = loop->inner; inner; inner = inner->next)
1587     {
1588       loop_info &inner_li = get_loop_info (inner);
1589       if (inner_li.rejected_p)
1590         {
1591           if (dump_enabled_p ())
1592             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1593                              "not versioning this loop because one of its"
1594                              " inner loops should not be versioned\n");
1595           return false;
1596         }
1597
1598       if (inner_li.worth_versioning_p ())
1599         li.subloops_benefit_p = true;
1600
1601       /* Accumulate the number of instructions from subloops that are not
1602          the innermost, or that don't benefit from versioning.  Only the
1603          instructions from innermost loops that benefit from versioning
1604          should be weighed against loop-versioning-max-inner-insns;
1605          everything else should be weighed against
1606          loop-versioning-max-outer-insns.  */
1607       if (!inner_li.worth_versioning_p () || inner->inner)
1608         {
1609           if (dump_enabled_p ())
1610             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1611                              "counting %d instructions from this loop"
1612                              " against its parent loop\n", inner_li.num_insns);
1613           li.num_insns += inner_li.num_insns;
1614         }
1615     }
1616
1617   /* Enforce the size limits.  */
1618   if (li.worth_versioning_p ())
1619     {
1620       unsigned int max_num_insns = max_insns_for_loop (loop);
1621       if (dump_enabled_p ())
1622         dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1623                          "this loop has %d instructions, against"
1624                          " a versioning limit of %d\n",
1625                          li.num_insns, max_num_insns);
1626       if (li.num_insns > max_num_insns)
1627         {
1628           if (dump_enabled_p ())
1629             dump_printf_loc (MSG_MISSED_OPTIMIZATION
1630                              | MSG_PRIORITY_USER_FACING,
1631                              find_loop_location (loop),
1632                              "this loop is too big to version");
1633           return false;
1634         }
1635     }
1636
1637   /* Hoist all version checks from subloops to this loop.  */
1638   for (class loop *subloop = loop->inner; subloop; subloop = subloop->next)
1639     merge_loop_info (loop, subloop);
1640
1641   return true;
1642 }
1643
1644 /* Decide which loops to version and add them to the versioning queue.
1645    Return true if there are any loops to version.  */
1646
1647 bool
1648 loop_versioning::make_versioning_decisions ()
1649 {
1650   AUTO_DUMP_SCOPE ("make_versioning_decisions",
1651                    dump_user_location_t::from_function_decl (m_fn->decl));
1652
1653   class loop *loop;
1654   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1655     {
1656       loop_info &linfo = get_loop_info (loop);
1657       if (decide_whether_loop_is_versionable (loop))
1658         {
1659           /* Commit to versioning LOOP directly if we can't hoist the
1660              version checks any further.  */
1661           if (linfo.worth_versioning_p ()
1662               && (loop_depth (loop) == 1 || linfo.outermost == loop))
1663             add_loop_to_queue (loop);
1664         }
1665       else
1666         {
1667           /* We can't version this loop, so individually version any
1668              subloops that would benefit and haven't been versioned yet.  */
1669           linfo.rejected_p = true;
1670           for (class loop *subloop = loop->inner; subloop;
1671                subloop = subloop->next)
1672             if (get_loop_info (subloop).worth_versioning_p ())
1673               add_loop_to_queue (subloop);
1674         }
1675     }
1676
1677   return !m_loops_to_version.is_empty ();
1678 }
1679
1680 /* Attempt to implement loop versioning for LOOP, using the information
1681    cached in the associated loop_info.  Return true on success.  */
1682
1683 bool
1684 loop_versioning::version_loop (class loop *loop)
1685 {
1686   loop_info &li = get_loop_info (loop);
1687
1688   /* Build up a condition that selects the original loop instead of
1689      the simplified loop.  */
1690   tree cond = boolean_false_node;
1691   bitmap_iterator bi;
1692   unsigned int i;
1693   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1694     {
1695       tree name = ssa_name (i);
1696       tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
1697                                  build_one_cst (TREE_TYPE (name)));
1698       cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
1699     }
1700
1701   /* Convert the condition into a suitable gcond.  */
1702   gimple_seq stmts = NULL;
1703   cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr, NULL_TREE);
1704
1705   /* Version the loop.  */
1706   initialize_original_copy_tables ();
1707   basic_block cond_bb;
1708   li.optimized_loop = loop_version (loop, cond, &cond_bb,
1709                                     profile_probability::unlikely (),
1710                                     profile_probability::likely (),
1711                                     profile_probability::unlikely (),
1712                                     profile_probability::likely (), true);
1713   free_original_copy_tables ();
1714   if (!li.optimized_loop)
1715     {
1716       if (dump_enabled_p ())
1717         dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
1718                          "tried but failed to version this loop for when"
1719                          " certain strides are 1\n");
1720       return false;
1721     }
1722
1723   if (dump_enabled_p ())
1724     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
1725                      "versioned this loop for when certain strides are 1\n");
1726
1727   /* Insert the statements that feed COND.  */
1728   if (stmts)
1729     {
1730       gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
1731       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1732     }
1733
1734   return true;
1735 }
1736
1737 /* Attempt to version all loops in the versioning queue.  */
1738
1739 void
1740 loop_versioning::implement_versioning_decisions ()
1741 {
1742   /* No AUTO_DUMP_SCOPE here since all messages are top-level and
1743      user-facing at this point.  */
1744
1745   bool any_succeeded_p = false;
1746   class loop *loop;
1747   unsigned int i;
1748   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1749     if (version_loop (loop))
1750       any_succeeded_p = true;
1751   if (!any_succeeded_p)
1752     return;
1753
1754   update_ssa (TODO_update_ssa);
1755
1756   /* Simplify the new loop, which is used when COND is false.  */
1757   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1758     {
1759       loop_info &linfo = get_loop_info (loop);
1760       if (linfo.optimized_loop)
1761         name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
1762     }
1763 }
1764
1765 /* Run the pass and return a set of TODO_* flags.  */
1766
1767 unsigned int
1768 loop_versioning::run ()
1769 {
1770   gcc_assert (scev_initialized_p ());
1771
1772   if (analyze_blocks ()
1773       && prune_conditions ()
1774       && make_versioning_decisions ())
1775     implement_versioning_decisions ();
1776
1777   return 0;
1778 }
1779
1780 /* Loop versioning pass.  */
1781
1782 const pass_data pass_data_loop_versioning =
1783 {
1784   GIMPLE_PASS, /* type */
1785   "lversion", /* name */
1786   OPTGROUP_LOOP, /* optinfo_flags */
1787   TV_LOOP_VERSIONING, /* tv_id */
1788   PROP_cfg, /* properties_required */
1789   0, /* properties_provided */
1790   0, /* properties_destroyed */
1791   0, /* todo_flags_start */
1792   0, /* todo_flags_finish */
1793 };
1794
1795 class pass_loop_versioning : public gimple_opt_pass
1796 {
1797 public:
1798   pass_loop_versioning (gcc::context *ctxt)
1799     : gimple_opt_pass (pass_data_loop_versioning, ctxt)
1800   {}
1801
1802   /* opt_pass methods: */
1803   virtual bool gate (function *) { return flag_version_loops_for_strides; }
1804   virtual unsigned int execute (function *);
1805 };
1806
1807 unsigned int
1808 pass_loop_versioning::execute (function *fn)
1809 {
1810   if (number_of_loops (fn) <= 1)
1811     return 0;
1812
1813   return loop_versioning (fn).run ();
1814 }
1815
1816 } // anon namespace
1817
1818 gimple_opt_pass *
1819 make_pass_loop_versioning (gcc::context *ctxt)
1820 {
1821   return new pass_loop_versioning (ctxt);
1822 }