/* Loop versioning pass.
   Copyright (C) 2018-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "tree-pass.h"
#include "gimplify-me.h"
#include "cfgloop.h"
#include "tree-ssa-loop.h"
#include "ssa.h"
#include "tree-scalar-evolution.h"
#include "tree-chrec.h"
#include "tree-ssa-loop-ivopts.h"
#include "fold-const.h"
#include "tree-ssa-propagate.h"
#include "tree-inline.h"
#include "domwalk.h"
#include "alloc-pool.h"
#include "vr-values.h"
#include "gimple-ssa-evrp-analyze.h"
#include "tree-vectorizer.h"
#include "omp-general.h"
#include "predict.h"
#include "tree-into-ssa.h"
#include "params.h"

namespace {
/* This pass looks for loops that could be simplified if certain loop
   invariant conditions were true.  It is effectively a form of loop
   splitting in which the pass produces the split conditions itself,
   instead of using ones that are already present in the IL.

   Versioning for when strides are 1
   ---------------------------------

   At the moment the only thing the pass looks for is memory references
   like:

     for (auto i : ...)
       ...x[i * stride]...

   It considers changing such loops to:

     if (stride == 1)
       for (auto i : ...)    [A]
         ...x[i]...
     else
       for (auto i : ...)    [B]
         ...x[i * stride]...

   This can have several benefits:

   (1) [A] is often easier or cheaper to vectorize than [B].

   (2) The scalar code in [A] is simpler than the scalar code in [B]
       (if the loops cannot be vectorized or need an epilogue loop).

   (3) We might recognize [A] as a pattern, such as a memcpy or memset.

   (4) [A] has simpler address evolutions, which can help other passes
       like loop interchange.

   The optimization is particularly useful for assumed-shape arrays in
   Fortran, where the stride of the innermost dimension depends on the
   array descriptor but is often equal to 1 in practice.  For example:

     subroutine f1(x)
       real :: x(:)
       x(:) = 100
     end subroutine f1

   generates the equivalent of:

     raw_stride = *x.dim[0].stride;
     stride = raw_stride != 0 ? raw_stride : 1;
     x_base = *x.data;
     ...
     tmp1 = stride * S;
     tmp2 = tmp1 - stride;
     *x_base[tmp2] = 1.0e+2;

   but in the common case that stride == 1, the last three statements
   simplify to:

     tmp3 = S + -1;
     *x_base[tmp3] = 1.0e+2;

   The optimization is in principle very simple.  The difficult parts are:

   (a) deciding which parts of a general address calculation correspond
       to the inner dimension of an array, since this usually isn't explicit
       in the IL, and for C often isn't even explicit in the source code

   (b) estimating when the transformation is worthwhile

   Structure
   ---------

   The pass has four phases:

   (1) Walk through the statements looking for and recording potential
       versioning opportunities.  Stop if there are none.

   (2) Use context-sensitive range information to see whether any versioning
       conditions are impossible in practice.  Remove them if so, and stop
       if no opportunities remain.

       (We do this only after (1) to keep compile time down when no
       versioning opportunities exist.)

   (3) Apply the cost model.  Decide which versioning opportunities are
       worthwhile and at which nesting level they should be applied.

   (4) Attempt to version all the loops selected by (3), so that:

         for (...)
           ...

       becomes:

         if (!cond)
           for (...) // Original loop
             ...
         else
           for (...) // New loop
             ...

       Use the version condition COND to simplify the new loop.  */
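
/* As a concrete illustration (hypothetical code, not taken from any
   particular testcase), a C function such as:

     void
     f (double *x, int stride, int n)
     {
       for (int i = 0; i < n; ++i)
         x[i * stride] += 1.0;
     }

   would conceptually be transformed as if the source had been:

     if (stride != 1)
       for (int i = 0; i < n; ++i)   // Original loop
         x[i * stride] += 1.0;
     else
       for (int i = 0; i < n; ++i)   // Simplified, easily-vectorized loop
         x[i] += 1.0;  */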
/* Enumerates the likelihood that a particular value indexes the inner
   dimension of an array.  */
enum inner_likelihood {
  INNER_UNLIKELY,
  INNER_DONT_KNOW,
  INNER_LIKELY
};
/* Information about one term of an address_info.  */
struct address_term_info
{
  /* The value of the term is EXPR * MULTIPLIER.  */
  tree expr;
  unsigned HOST_WIDE_INT multiplier;

  /* The stride applied by EXPR in each iteration of some unrecorded loop,
     or null if no stride has been identified.  */
  tree stride;

  /* Enumerates the likelihood that EXPR indexes the inner dimension
     of an array.  */
  enum inner_likelihood inner_likelihood;

  /* True if STRIDE == 1 is a versioning opportunity when considered
     in isolation.  */
  bool versioning_opportunity_p;
};
/* Information about an address calculation, and the range of constant
   offsets applied to it.  */
struct address_info
{
  static const unsigned int MAX_TERMS = 8;

  /* One statement that calculates the address.  If multiple statements
     share the same address, we only record the first.  */
  gimple *stmt;

  /* The loop containing STMT (cached for convenience).  If multiple
     statements share the same address, they all belong to this loop.  */
  struct loop *loop;

  /* A decomposition of the calculation into a sum of terms plus an
     optional base.  When BASE is provided, it is never an SSA name.
     Once initialization is complete, all members of TERMS are SSA names.  */
  tree base;
  auto_vec<address_term_info, MAX_TERMS> terms;

  /* All bytes accessed from the address fall in the offset range
     [MIN_OFFSET, MAX_OFFSET).  */
  HOST_WIDE_INT min_offset, max_offset;
};
/* Stores addresses based on their base and terms (ignoring the offsets).  */
struct address_info_hasher : nofree_ptr_hash <address_info>
{
  static hashval_t hash (const address_info *);
  static bool equal (const address_info *, const address_info *);
};
/* Information about the versioning we'd like to apply to a loop.  */
struct loop_info
{
  bool worth_versioning_p () const;

  /* True if we've decided not to version this loop.  The remaining
     fields are meaningless if so.  */
  bool rejected_p;

  /* True if at least one subloop of this loop benefits from versioning.  */
  bool subloops_benefit_p;

  /* An estimate of the total number of instructions in the loop,
     excluding those in subloops that benefit from versioning.  */
  unsigned int num_insns;

  /* The outermost loop that can handle all the version checks
     described below.  */
  struct loop *outermost;

  /* The first entry in the list of blocks that belong to this loop
     (and not to subloops).  m_next_block_in_loop provides the chain
     pointers for the list.  */
  basic_block block_list;

  /* We'd like to version the loop for the case in which these SSA names
     (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime.  */
  bitmap_head unity_names;

  /* If versioning succeeds, this points to the version of the loop that
     assumes the version conditions hold.  */
  struct loop *optimized_loop;
};
/* The main pass structure.  */
class loop_versioning
{
public:
  loop_versioning (function *);
  ~loop_versioning ();
  unsigned int run ();

private:
  /* Used to walk the dominator tree to find loop versioning conditions
     that are always false.  */
  class lv_dom_walker : public dom_walker
  {
  public:
    lv_dom_walker (loop_versioning &);

    edge before_dom_children (basic_block) FINAL OVERRIDE;
    void after_dom_children (basic_block) FINAL OVERRIDE;

  private:
    /* The parent pass.  */
    loop_versioning &m_lv;

    /* Used to build context-dependent range information.  */
    evrp_range_analyzer m_range_analyzer;
  };

  /* Used to simplify statements based on conditions that are established
     by the version checks.  */
  class name_prop : public substitute_and_fold_engine
  {
  public:
    name_prop (loop_info &li) : m_li (li) {}
    tree get_value (tree) FINAL OVERRIDE;

  private:
    /* Information about the versioning we've performed on the loop.  */
    loop_info &m_li;
  };

  loop_info &get_loop_info (struct loop *loop) { return m_loops[loop->num]; }

  unsigned int max_insns_for_loop (struct loop *);
  bool expensive_stmt_p (gimple *);

  void version_for_unity (gimple *, tree);
  bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
				unsigned HOST_WIDE_INT * = 0);
  bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
  bool multiply_term_by (address_term_info &, tree);
  inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
  void analyze_stride (address_info &, address_term_info &,
		       tree, struct loop *);
  bool find_per_loop_multiplication (address_info &, address_term_info &);
  void analyze_term_using_scevs (address_info &, address_term_info &);
  void analyze_address_fragment (address_info &);
  void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
				tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
  void analyze_expr (gimple *, tree);
  bool analyze_block (basic_block);
  bool analyze_blocks ();

  void prune_loop_conditions (struct loop *, vr_values *);
  bool prune_conditions ();

  void merge_loop_info (struct loop *, struct loop *);
  void add_loop_to_queue (struct loop *);
  bool decide_whether_loop_is_versionable (struct loop *);
  bool make_versioning_decisions ();

  bool version_loop (struct loop *);
  void implement_versioning_decisions ();

  /* The function we're optimizing.  */
  function *m_fn;

  /* The obstack to use for all pass-specific bitmaps.  */
  bitmap_obstack m_bitmap_obstack;

  /* An obstack to use for general allocation.  */
  obstack m_obstack;

  /* The number of loops in the function.  */
  unsigned int m_nloops;

  /* The total number of loop version conditions we've found.  */
  unsigned int m_num_conditions;

  /* Assume that an address fragment of the form i * stride * scale
     (for variable stride and constant scale) will not benefit from
     versioning for stride == 1 when scale is greater than this value.  */
  unsigned HOST_WIDE_INT m_maximum_scale;

  /* Information about each loop.  */
  auto_vec<loop_info> m_loops;

  /* Used to form a linked list of blocks that belong to a loop,
     started by loop_info::block_list.  */
  auto_vec<basic_block> m_next_block_in_loop;

  /* The list of loops that we've decided to version.  */
  auto_vec<struct loop *> m_loops_to_version;

  /* A table of addresses in the current loop, keyed off their values
     but not their offsets.  */
  hash_table <address_info_hasher> m_address_table;

  /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
  auto_vec <address_info *, 32> m_address_list;
};
/* If EXPR is an SSA name and not a default definition, return the
   defining statement, otherwise return null.  */

static gimple *
maybe_get_stmt (tree expr)
{
  if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
    return SSA_NAME_DEF_STMT (expr);
  return NULL;
}

/* Like maybe_get_stmt, but also return null if the defining
   statement isn't an assignment.  */

static gassign *
maybe_get_assign (tree expr)
{
  return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
}

/* Return true if this pass should look through a cast of expression FROM
   to type TYPE when analyzing pieces of an address.  */

static bool
look_through_cast_p (tree type, tree from)
{
  return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
	  && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
}

/* Strip all conversions of integers or pointers from EXPR, regardless
   of whether the conversions are nops.  This is useful in the context
   of this pass because we're not trying to fold or simulate the
   expression; we just want to see how it's structured.  */

static tree
strip_casts (tree expr)
{
  const unsigned int MAX_NITERS = 4;

  tree type = TREE_TYPE (expr);
  while (CONVERT_EXPR_P (expr)
	 && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
    expr = TREE_OPERAND (expr, 0);

  for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
    {
      gassign *assign = maybe_get_assign (expr);
      if (assign
	  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
	  && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
	expr = gimple_assign_rhs1 (assign);
      else
	break;
    }
  return expr;
}
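
/* For example (using illustrative SSA names), given a definition chain
   such as:

     _1 = (sizetype) i_2;
     _3 = (long unsigned int) _1;

   strip_casts (_3) returns i_2.  The conversions need not be
   value-preserving; we only want to see how the address is built up.  */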
/* Compare two address_term_infos in the same address_info.  */

static int
compare_address_terms (const void *a_uncast, const void *b_uncast)
{
  const address_term_info *a = (const address_term_info *) a_uncast;
  const address_term_info *b = (const address_term_info *) b_uncast;

  if (a->expr != b->expr)
    return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;

  if (a->multiplier != b->multiplier)
    return a->multiplier < b->multiplier ? -1 : 1;

  return 0;
}

/* Dump ADDRESS using flags FLAGS.  */

static void
dump_address_info (dump_flags_t flags, address_info &address)
{
  if (address.base)
    dump_printf (flags, "%T + ", address.base);
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    {
      if (i != 0)
	dump_printf (flags, " + ");
      dump_printf (flags, "%T", address.terms[i].expr);
      if (address.terms[i].multiplier != 1)
	dump_printf (flags, " * %wd", address.terms[i].multiplier);
    }
  dump_printf (flags, " + [%wd, %wd]",
	       address.min_offset, address.max_offset - 1);
}

/* Hash an address_info based on its base and terms.  */

hashval_t
address_info_hasher::hash (const address_info *info)
{
  inchash::hash hash;
  hash.add_int (info->base ? TREE_CODE (info->base) : 0);
  hash.add_int (info->terms.length ());
  for (unsigned int i = 0; i < info->terms.length (); ++i)
    {
      hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
      hash.add_hwi (info->terms[i].multiplier);
    }
  return hash.end ();
}

/* Return true if two address_infos have equal bases and terms.  Other
   properties might be different (such as the statement or constant
   offset range).  */

bool
address_info_hasher::equal (const address_info *a, const address_info *b)
{
  if (a->base != b->base
      && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
    return false;

  if (a->terms.length () != b->terms.length ())
    return false;

  for (unsigned int i = 0; i < a->terms.length (); ++i)
    if (a->terms[i].expr != b->terms[i].expr
	|| a->terms[i].multiplier != b->terms[i].multiplier)
      return false;

  return true;
}
/* Return true if we want to version the loop, i.e. if we have a
   specific reason for doing so and no specific reason not to.  */

bool
loop_info::worth_versioning_p () const
{
  return (!rejected_p
	  && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
}

loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
  : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
{
}

/* Process BB before processing the blocks it dominates.  */

edge
loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
{
  m_range_analyzer.enter (bb);

  if (bb == bb->loop_father->header)
    m_lv.prune_loop_conditions (bb->loop_father,
				m_range_analyzer.get_vr_values ());

  for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
       gsi_next (&si))
    m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false);

  return NULL;
}

/* Process BB after processing the blocks it dominates.  */

void
loop_versioning::lv_dom_walker::after_dom_children (basic_block bb)
{
  m_range_analyzer.leave (bb);
}

/* Decide whether to replace VAL with a new value in a versioned loop.
   Return the new value if so, otherwise return null.  */

tree
loop_versioning::name_prop::get_value (tree val)
{
  if (TREE_CODE (val) == SSA_NAME
      && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
    return build_one_cst (TREE_TYPE (val));
  return NULL_TREE;
}
/* Initialize the structure to optimize FN.  */

loop_versioning::loop_versioning (function *fn)
  : m_fn (fn),
    m_nloops (number_of_loops (fn)),
    m_num_conditions (0),
    m_address_table (31)
{
  bitmap_obstack_initialize (&m_bitmap_obstack);
  gcc_obstack_init (&m_obstack);

  /* Initialize the loop information.  */
  m_loops.safe_grow_cleared (m_nloops);
  for (unsigned int i = 0; i < m_nloops; ++i)
    {
      m_loops[i].outermost = get_loop (m_fn, 0);
      bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
    }

  /* Initialize the list of blocks that belong to each loop.  */
  unsigned int nbbs = last_basic_block_for_fn (fn);
  m_next_block_in_loop.safe_grow (nbbs);
  basic_block bb;
  FOR_EACH_BB_FN (bb, fn)
    {
      loop_info &li = get_loop_info (bb->loop_father);
      m_next_block_in_loop[bb->index] = li.block_list;
      li.block_list = bb;
    }

  /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
     unvectorizable code, since it is the largest size that can be
     handled efficiently by scalar code.  omp_max_vf calculates the
     maximum number of bytes in a vector, when such a value is relevant
     to loop optimization.  */
  m_maximum_scale = estimated_poly_value (omp_max_vf ());
  m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
}

loop_versioning::~loop_versioning ()
{
  bitmap_obstack_release (&m_bitmap_obstack);
  obstack_free (&m_obstack, NULL);
}
/* Return the maximum number of instructions allowed in LOOP before
   it becomes too big for versioning.

   There are separate limits for inner and outer loops.  The limit for
   inner loops applies only to loops that benefit directly from versioning.
   The limit for outer loops applies to all code in the outer loop and
   its subloops that *doesn't* benefit directly from versioning; such code
   would be "taken along for the ride".  The idea is that if the cost of
   the latter is small, it is better to version outer loops rather than
   inner loops, both to reduce the number of repeated checks and to enable
   more of the loop nest to be optimized as a natural nest (e.g. by loop
   interchange or outer-loop vectorization).  */

unsigned int
loop_versioning::max_insns_for_loop (struct loop *loop)
{
  return (loop->inner
	  ? PARAM_VALUE (PARAM_LOOP_VERSIONING_MAX_OUTER_INSNS)
	  : PARAM_VALUE (PARAM_LOOP_VERSIONING_MAX_INNER_INSNS));
}

/* Return true if for cost reasons we should avoid versioning any loop
   that contains STMT.

   Note that we don't need to check whether versioning is invalid for
   correctness reasons, since the versioning process does that for us.
   The conditions involved are too rare to be worth duplicating here.  */

bool
loop_versioning::expensive_stmt_p (gimple *stmt)
{
  if (gcall *call = dyn_cast <gcall *> (stmt))
    /* Assume for now that the time spent in an "expensive" call would
       overwhelm any saving from versioning.  */
    return !gimple_inexpensive_call_p (call);
  return false;
}
/* Record that we want to version the loop that contains STMT for the
   case in which SSA name NAME is equal to 1.  We already know that NAME
   is invariant in the loop.  */

void
loop_versioning::version_for_unity (gimple *stmt, tree name)
{
  struct loop *loop = loop_containing_stmt (stmt);
  loop_info &li = get_loop_info (loop);

  if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
    {
      /* This is the first time we've wanted to version LOOP for NAME.
	 Keep track of the outermost loop that can handle all versioning
	 checks in LI.  */
      struct loop *outermost
	= outermost_invariant_loop_for_expr (loop, name);
      if (loop_depth (li.outermost) < loop_depth (outermost))
	li.outermost = outermost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
			   " for when %T == 1", name);
	  if (outermost == loop)
	    dump_printf (MSG_NOTE, "; cannot hoist check further");
	  else
	    {
	      dump_printf (MSG_NOTE, "; could implement the check at loop"
			   " depth %d", loop_depth (outermost));
	      if (loop_depth (li.outermost) > loop_depth (outermost))
		dump_printf (MSG_NOTE, ", but other checks only allow"
			     " a depth of %d", loop_depth (li.outermost));
	    }
	  dump_printf (MSG_NOTE, "\n");
	}

      m_num_conditions += 1;
    }
  else
    {
      /* This is a duplicate request.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
			 " loop for when %T == 1\n", name);
    }
}
/* Return true if OP1_TREE is constant and if in principle it is worth
   versioning an address fragment of the form:

     i * OP1_TREE * OP2 * stride

   for the case in which stride == 1.  This in practice means testing
   whether:

     OP1_TREE * OP2 <= M_MAXIMUM_SCALE.

   If RESULT is nonnull, store OP1_TREE * OP2 there when returning true.  */

bool
loop_versioning::acceptable_multiplier_p (tree op1_tree,
					  unsigned HOST_WIDE_INT op2,
					  unsigned HOST_WIDE_INT *result)
{
  if (tree_fits_uhwi_p (op1_tree))
    {
      unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
      /* The first part checks for overflow.  */
      if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
	{
	  if (result)
	    *result = op1 * op2;
	  return true;
	}
    }
  return false;
}
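
/* For example, assuming for illustration that M_MAXIMUM_SCALE is 64:
   with an existing multiplier OP2 of 8, a constant OP1_TREE of 4 gives
   4 * 8 = 32 <= 64, so the call returns true and stores 32 in *RESULT,
   whereas a constant OP1_TREE of 16 would give 128 and be rejected.  */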
/* Return true if it is worth using loop versioning on a memory access
   of type TYPE.  Store the size of the access in *SIZE if so.  */

bool
loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
{
  return (TYPE_SIZE_UNIT (type)
	  && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
}

/* See whether OP is constant and whether we can multiply TERM by that
   constant without exceeding M_MAXIMUM_SCALE.  Return true and update
   TERM if so.  */

bool
loop_versioning::multiply_term_by (address_term_info &term, tree op)
{
  return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
}
/* Decide whether an address fragment of the form STRIDE * MULTIPLIER
   is likely to be indexing an innermost dimension, returning the result
   as an INNER_* probability.  */

inner_likelihood
loop_versioning::get_inner_likelihood (tree stride,
				       unsigned HOST_WIDE_INT multiplier)
{
  const unsigned int MAX_NITERS = 8;

  /* Iterate over possible values of STRIDE.  Return INNER_LIKELY if at
     least one of those values is likely to be for the innermost dimension.
     Record in UNLIKELY_P if at least one of those values is unlikely to be
     for the innermost dimension.

     E.g. for:

       stride = cond ? a * b : 1

     we should treat STRIDE as being a likely inner dimension, since
     we know that it is 1 under at least some circumstances.  (See the
     Fortran example below.)  However:

       stride = a * b

     on its own is unlikely to be for the innermost dimension, since
     that would require both a and b to be 1 at runtime.  */
  bool unlikely_p = false;
  tree worklist[MAX_NITERS];
  unsigned int length = 0;
  worklist[length++] = stride;
  for (unsigned int i = 0; i < length; ++i)
    {
      tree expr = worklist[i];

      if (CONSTANT_CLASS_P (expr))
	{
	  /* See if EXPR * MULTIPLIER would be consistent with an individual
	     access or a small grouped access.  */
	  if (acceptable_multiplier_p (expr, multiplier))
	    return INNER_LIKELY;
	  else
	    unlikely_p = true;
	}
      else if (gimple *stmt = maybe_get_stmt (expr))
	{
	  /* If EXPR is set by a PHI node, queue its arguments in case
	     we find one that is consistent with an inner dimension.

	     An important instance of this is the Fortran handling of array
	     descriptors, which calculates the stride of the inner dimension
	     using a PHI equivalent of:

		 raw_stride = a.dim[0].stride;
		 stride = raw_stride != 0 ? raw_stride : 1;

	     (Strides for outer dimensions do not treat 0 specially.)  */
	  if (gphi *phi = dyn_cast <gphi *> (stmt))
	    {
	      unsigned int nargs = gimple_phi_num_args (phi);
	      for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
		worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
	    }
	  /* If the value is set by an assignment, expect it to be read
	     from memory (such as an array descriptor) rather than be
	     calculated.  */
	  else if (gassign *assign = dyn_cast <gassign *> (stmt))
	    {
	      if (!gimple_assign_load_p (assign))
		unlikely_p = true;
	    }
	  /* Things like calls don't really tell us anything.  */
	}
    }

  /* We didn't find any possible values of STRIDE that were likely to be
     for the innermost dimension.  If we found one that was actively
     unlikely to be for the innermost dimension, assume that that applies
     to STRIDE too.  */
  return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
}
/* The caller has identified that STRIDE is the stride of interest
   in TERM, and that the stride is applied in OP_LOOP.  Record this
   information in TERM, deciding whether STRIDE is likely to be for
   the innermost dimension of an array and whether it represents a
   versioning opportunity.  ADDRESS is the address that contains TERM.  */

void
loop_versioning::analyze_stride (address_info &address,
				 address_term_info &term,
				 tree stride, struct loop *op_loop)
{
  term.stride = stride;

  term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
  if (dump_enabled_p ())
    {
      if (term.inner_likelihood == INNER_LIKELY)
	dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
			 " innermost dimension\n", stride);
      else if (term.inner_likelihood == INNER_UNLIKELY)
	dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
			 " innermost dimension\n", stride);
      else
	dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
			 " is the innermost dimension\n", stride);
    }

  /* To be a versioning opportunity we require:

     - The multiplier applied by TERM is equal to the access size,
       so that when STRIDE is 1, the accesses in successive loop
       iterations are consecutive.

       This is deliberately conservative.  We could relax it to handle
       other cases (such as those with gaps between iterations) if we
       find any real testcases for which it's useful.

     - the stride is applied in the same loop as STMT rather than
       in an outer loop.  Although versioning for strides applied in
       outer loops could help in some cases -- such as enabling
       more loop interchange -- the savings are much lower than for
       inner loops.

     - the stride is an SSA name that is invariant in STMT's loop,
       since otherwise versioning isn't possible.  */
  unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
  if (term.multiplier == access_size
      && address.loop == op_loop
      && TREE_CODE (stride) == SSA_NAME
      && expr_invariant_in_loop_p (address.loop, stride))
    {
      term.versioning_opportunity_p = true;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
			 " opportunity\n", stride);
    }
}
/* See whether address term TERM (which belongs to ADDRESS) is the result
   of multiplying a varying SSA name by a loop-invariant SSA name.
   Return true and update TERM if so.

   This handles both cases that SCEV might handle, such as:

     for (int i = 0; i < n; ++i)
       res += a[i * stride];

   and ones in which the term varies arbitrarily between iterations, such as:

     for (int i = 0; i < n; ++i)
       res += a[index[i] * stride];  */

bool
loop_versioning::find_per_loop_multiplication (address_info &address,
					       address_term_info &term)
{
  gimple *mult = maybe_get_assign (term.expr);
  if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
    return false;

  struct loop *mult_loop = loop_containing_stmt (mult);
  if (!loop_outer (mult_loop))
    return false;

  tree op1 = strip_casts (gimple_assign_rhs1 (mult));
  tree op2 = strip_casts (gimple_assign_rhs2 (mult));
  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
    return false;

  bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
  bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
  if (invariant1_p == invariant2_p)
    return false;

  /* Make sure that the loop invariant is OP2 rather than OP1.  */
  if (invariant1_p)
    std::swap (op1, op2);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
		     " * loop-invariant %T\n", term.expr, op1, op2);
  analyze_stride (address, term, op2, mult_loop);
  return true;
}
/* Try to use scalar evolutions to find an address stride for TERM,
   which belongs to ADDRESS.

   Here we are interested in any evolution information we can find,
   not just evolutions wrt ADDRESS->LOOP.  For example, if we find that
   an outer loop obviously iterates over the inner dimension of an array,
   that information can help us eliminate worthless versioning opportunities
   in inner loops.  */

void
loop_versioning::analyze_term_using_scevs (address_info &address,
					   address_term_info &term)
{
  gimple *setter = maybe_get_stmt (term.expr);
  if (!setter)
    return;

  struct loop *wrt_loop = loop_containing_stmt (setter);
  if (!loop_outer (wrt_loop))
    return;

  tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
  if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, address.stmt,
			 "address term %T = %T\n", term.expr, chrec);

      /* Peel casts and accumulate constant multiplications, up to the
	 limit allowed by M_MAXIMUM_SCALE.  */
      tree stride = strip_casts (CHREC_RIGHT (chrec));
      while (TREE_CODE (stride) == MULT_EXPR
	     && multiply_term_by (term, TREE_OPERAND (stride, 1)))
	stride = strip_casts (TREE_OPERAND (stride, 0));

      gassign *assign;
      while ((assign = maybe_get_assign (stride))
	     && gimple_assign_rhs_code (assign) == MULT_EXPR
	     && multiply_term_by (term, gimple_assign_rhs2 (assign)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, address.stmt,
			     "looking through %G", assign);
	  stride = strip_casts (gimple_assign_rhs1 (assign));
	}

      analyze_stride (address, term, stride, get_chrec_loop (chrec));
    }
}
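
/* For example (illustrative only), if an index is accumulated rather
   than multiplied explicitly:

     int idx = 0;
     for (int i = 0; i < n; ++i)
       {
	 res += a[idx];
	 idx += stride;
       }

   the term's definition is a PHI, so find_per_loop_multiplication fails,
   but SCEV analysis gives the evolution {0, +, stride}, whose CHREC_RIGHT
   supplies "stride" as the candidate passed to analyze_stride.  The two
   while loops above would additionally peel a constant factor out of a
   step such as stride * 4 into TERM.multiplier.  */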
/* Try to identify loop strides in ADDRESS and try to choose realistic
   versioning opportunities based on these strides.

   The main difficulty here isn't finding strides that could be used
   in a version check (that's pretty easy).  The problem instead is to
   avoid versioning for some stride S that is unlikely ever to be 1 at
   runtime.  Versioning for S == 1 on its own would lead to unnecessary
   code bloat, while adding S == 1 to more realistic version conditions
   would lose the optimisation opportunity offered by those other conditions.

   For example, versioning for a stride of 1 in the Fortran code:

     integer :: a(:,:)
     a(1,:) = 1

   is not usually a good idea, since the assignment is iterating over
   an outer dimension and is relatively unlikely to have a stride of 1.
   (It isn't impossible, since the inner dimension might be 1, or the
   array might be transposed.)  Similarly, in:

     integer :: a(:,:), b(:,:)
     b(:,1) = a(1,:)

   b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
   Versioning for when both strides are 1 would lose most of the benefit
   of versioning for b's access.

   The approach we take is as follows:

   - Analyze each term to see whether it has an identifiable stride,
     regardless of which loop applies the stride.

   - Evaluate the likelihood that each such stride is for the innermost
     dimension of an array, on the scale "likely", "don't know" or "unlikely".

   - If there is a single "likely" innermost stride, and that stride is
     applied in the loop that contains STMT, version the loop for when the
     stride is 1.  This deals with the cases in which we're fairly
     confident of doing the right thing, such as the b(:,1) reference above.

   - If there are no "likely" innermost strides, and the loop that contains
     STMT uses a stride that we rated as "don't know", version for when
     that stride is 1.  This is principally used for C code such as:

       for (int i = 0; i < n; ++i)
	 a[i * x] = ...;

     and:

       for (int j = 0; j < n; ++j)
	 for (int i = 0; i < n; ++i)
	   a[i * x + j * y] = ...;

     where nothing in the way "x" and "y" are set gives a hint as to
     whether "i" iterates over the innermost dimension of the array.
     In these situations it seems reasonable to assume that the
     programmer has nested the loops appropriately (although of course
     there are examples like GEMM in which this assumption doesn't hold
     for all accesses in the loop).

     This case is also useful for the Fortran equivalent of the
     above C code.  */
void
loop_versioning::analyze_address_fragment (address_info &address)
{
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
      dump_address_info (MSG_NOTE, address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Analyze each component of the sum to see whether it involves an
     apparent stride.

     There is an overlap between the addresses that
     find_per_loop_multiplication and analyze_term_using_scevs can handle,
     but the former is much cheaper than SCEV analysis, so try it first.  */
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (!find_per_loop_multiplication (address, address.terms[i]))
      analyze_term_using_scevs (address, address.terms[i]);

  /* Check for strides that are likely to be for the innermost dimension.

     1. If there is a single likely inner stride, if it is an SSA name,
	and if it is worth versioning the loop for when the SSA name
	equals 1, record that we want to do so.

     2. Otherwise, if there are any likely inner strides, bail out.
	This means one of:

	(a) There are multiple likely inner strides.  This suggests we're
	    confused and can't be confident of doing the right thing.

	(b) There is a single likely inner stride and it is a constant
	    rather than an SSA name.  This can mean either that the access
	    is a natural one without any variable strides, such as:

	      for (int i = 0; i < n; ++i)
		a[i] += 1;

	    or that a variable stride is applied to an outer dimension,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j * stride][i] += 1;

	(c) There is a single likely inner stride, and it is an SSA name,
	    but it isn't a worthwhile versioning opportunity.  This usually
	    means that the variable stride is applied by an outer loop,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j][i * stride] += 1;

	    or (using an example with a more natural loop nesting):

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[i][j] += b[i * stride];

	    in cases where b[i * stride] cannot (yet) be hoisted for
	    aliasing reasons.

     3. If there are no likely inner strides, fall through to the next
	set of checks.

     Pointer equality is enough to check for uniqueness in (1), since we
     only care about SSA names.  */
  tree chosen_stride = NULL_TREE;
  tree version_stride = NULL_TREE;
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (chosen_stride != address.terms[i].stride
	&& address.terms[i].inner_likelihood == INNER_LIKELY)
      {
	if (chosen_stride)
	  return;
	chosen_stride = address.terms[i].stride;
	if (address.terms[i].versioning_opportunity_p)
	  version_stride = chosen_stride;
      }

  /* If there are no likely inner strides, see if there is a single
     versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
     See the comment above the function for the cases that this code
     handles.  */
  if (!chosen_stride)
    for (unsigned int i = 0; i < address.terms.length (); ++i)
      if (version_stride != address.terms[i].stride
	  && address.terms[i].inner_likelihood == INNER_DONT_KNOW
	  && address.terms[i].versioning_opportunity_p)
	{
	  if (version_stride)
	    return;
	  version_stride = address.terms[i].stride;
	}

  if (version_stride)
    version_for_unity (address.stmt, version_stride);
}
/* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
   TYPE_SIZE bytes and record this address fragment for later processing.
   STMT is the statement that contains the address.  */
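/* For example (illustrative SSA names), given a 4-byte access whose
   address is built up as:

     _1 = i_2 * stride_3;
     _4 = _1 + j_5;
     _6 = _4 * 4;

   recording the fragment _6 * 1 decomposes it into the terms _1 * 4 and
   j_5 * 4: the constant multiplication is folded into the multipliers and
   the addition is split into separate terms.  The variable multiplication
   inside _1 is deliberately left alone here; find_per_loop_multiplication
   and the SCEV analysis above interpret it later.  */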
void
loop_versioning::record_address_fragment (gimple *stmt,
					  unsigned HOST_WIDE_INT type_size,
					  tree expr,
					  unsigned HOST_WIDE_INT multiplier,
					  HOST_WIDE_INT offset)
{
  /* We're only interested in computed values.  */
  if (TREE_CODE (expr) != SSA_NAME)
    return;

  /* Quick exit if no part of the address is calculated in STMT's loop,
     since such addresses have no versioning opportunities.  */
  struct loop *loop = loop_containing_stmt (stmt);
  if (expr_invariant_in_loop_p (loop, expr))
    return;

  /* Set up an address_info for EXPR * MULTIPLIER.  */
  address_info *address = XOBNEW (&m_obstack, address_info);
  new (address) address_info;
  address->stmt = stmt;
  address->loop = loop;
  address->base = NULL_TREE;
  address->terms.quick_grow (1);
  address->terms[0].expr = expr;
  address->terms[0].multiplier = multiplier;
  address->terms[0].stride = NULL_TREE;
  address->terms[0].inner_likelihood = INNER_UNLIKELY;
  address->terms[0].versioning_opportunity_p = false;
  address->min_offset = offset;

  /* Peel apart the expression into a sum of address_terms, where each
     term is multiplied by a constant.  Treat a + b and a - b the same,
     since it doesn't matter for our purposes whether an address is
     increasing or decreasing.  Distribute (a + b) * constant into
     a * constant + b * constant.

     We don't care which loop each term belongs to, since we want to
     examine as many candidate strides as possible when determining
     which is likely to be for the innermost dimension.  We therefore
     don't limit the search to statements in STMT's loop.  */
  for (unsigned int i = 0; i < address->terms.length (); )
    {
      if (gassign *assign = maybe_get_assign (address->terms[i].expr))
	{
	  tree_code code = gimple_assign_rhs_code (assign);
	  if (code == PLUS_EXPR
	      || code == POINTER_PLUS_EXPR
	      || code == MINUS_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (TREE_CODE (op2) == INTEGER_CST)
		{
		  address->terms[i].expr = strip_casts (op1);
		  /* This is heuristic only, so don't worry about truncation
		     or overflow.  */
		  address->min_offset += (TREE_INT_CST_LOW (op2)
					  * address->terms[i].multiplier);
		  continue;
		}
	      else if (address->terms.length () < address_info::MAX_TERMS)
		{
		  unsigned int j = address->terms.length ();
		  address->terms.quick_push (address->terms[i]);
		  address->terms[i].expr = strip_casts (op1);
		  address->terms[j].expr = strip_casts (op2);
		  continue;
		}
	    }
	  if (code == MULT_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (multiply_term_by (address->terms[i], op2))
		{
		  address->terms[i].expr = strip_casts (op1);
		  continue;
		}
	    }
	}
      i += 1;
    }

  /* Peel off any symbolic pointer.  */
  if (TREE_CODE (address->terms[0].expr) != SSA_NAME
      && address->terms[0].multiplier == 1)
    {
      if (address->terms.length () == 1)
	{
	  obstack_free (&m_obstack, address);
	  return;
	}
      address->base = address->terms[0].expr;
      address->terms.ordered_remove (0);
    }

  /* Require all remaining terms to be SSA names.  (This could be false
     for unfolded statements, but they aren't worth dealing with.)  */
  for (unsigned int i = 0; i < address->terms.length (); ++i)
    if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
      {
	obstack_free (&m_obstack, address);
	return;
      }

  /* The loop above set MIN_OFFSET based on the first byte of the
     referenced data.  Calculate the end + 1.  */
  address->max_offset = address->min_offset + type_size;

  /* Put the terms into a canonical order for the hash table lookup below.  */
  address->terms.qsort (compare_address_terms);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
      if (multiplier != 1)
	dump_printf (MSG_NOTE, " * %wd", multiplier);
      dump_printf (MSG_NOTE, " = ");
      dump_address_info (MSG_NOTE, *address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Pool address information with the same terms (but potentially
     different offsets).  */
  address_info **slot = m_address_table.find_slot (address, INSERT);
  if (address_info *old_address = *slot)
    {
      /* We've already seen an address with the same terms.  Extend the
	 offset range to account for this access.  Doing this can paper
	 over gaps, such as in:

	   a[i * stride * 4] + a[i * stride * 4 + 3];

	 where nothing references "+ 1" or "+ 2".  However, the vectorizer
	 handles such gapped accesses without problems, so it's not worth
	 trying to exclude them.  */
      if (old_address->min_offset > address->min_offset)
	old_address->min_offset = address->min_offset;
      if (old_address->max_offset < address->max_offset)
	old_address->max_offset = address->max_offset;
      obstack_free (&m_obstack, address);
    }
  else
    {
      /* This is the first time we've seen an address with these terms.  */
      *slot = address;
      m_address_list.safe_push (address);
    }
}
/* Analyze expression EXPR, which occurs in STMT.  */

void
loop_versioning::analyze_expr (gimple *stmt, tree expr)
{
  unsigned HOST_WIDE_INT type_size;

  while (handled_component_p (expr))
    {
      /* See whether we can use versioning to avoid a multiplication
	 in an array index.  */
      if (TREE_CODE (expr) == ARRAY_REF
	  && acceptable_type_p (TREE_TYPE (expr), &type_size))
	record_address_fragment (stmt, type_size,
				 TREE_OPERAND (expr, 1), type_size, 0);
      expr = TREE_OPERAND (expr, 0);
    }

  /* See whether we can use versioning to avoid a multiplication
     in the pointer calculation of a MEM_REF.  */
  if (TREE_CODE (expr) == MEM_REF
      && acceptable_type_p (TREE_TYPE (expr), &type_size))
    record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
			     /* This is heuristic only, so don't worry
				about truncation or overflow.  */
			     TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));

  /* These would be easy to handle if they existed at this stage.  */
  gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
}
/* Analyze all the statements in BB looking for useful version checks.
   Return true on success, false if something prevents the block from
   being versioned.  */

bool
loop_versioning::analyze_block (basic_block bb)
{
  struct loop *loop = bb->loop_father;
  loop_info &li = get_loop_info (loop);
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      if (is_gimple_debug (stmt))
	continue;

      if (expensive_stmt_p (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
			     " prevents versioning: %G", stmt);
	  return false;
	}

      /* Only look for direct versioning opportunities in inner loops
	 since the benefit tends to be much smaller for outer loops.  */
      if (!loop->inner)
	{
	  unsigned int nops = gimple_num_ops (stmt);
	  for (unsigned int i = 0; i < nops; ++i)
	    if (tree op = gimple_op (stmt, i))
	      analyze_expr (stmt, op);
	}

      /* The point of the instruction limit is to prevent excessive
	 code growth, so this is a size-based estimate even though
	 the optimization is aimed at speed.  */
      li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
    }

  return true;
}
/* Analyze all the blocks in the function, looking for useful version checks.
   Return true if we found one.  */

bool
loop_versioning::analyze_blocks ()
{
  AUTO_DUMP_SCOPE ("analyze_blocks",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  /* For now we don't try to version the whole function, although
     versioning at that level could be useful in some cases.  */
  get_loop_info (get_loop (m_fn, 0)).rejected_p = true;

  struct loop *loop;
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop_info &linfo = get_loop_info (loop);

      /* Ignore cold loops.  */
      if (!optimize_loop_for_speed_p (loop))
	linfo.rejected_p = true;

      /* See whether an inner loop prevents versioning of this loop.  */
      if (!linfo.rejected_p)
	for (struct loop *inner = loop->inner; inner; inner = inner->next)
	  if (get_loop_info (inner).rejected_p)
	    {
	      linfo.rejected_p = true;
	      break;
	    }

      /* If versioning the loop is still a possibility, examine the
	 statements in the loop to look for versioning opportunities.  */
      if (!linfo.rejected_p)
	{
	  void *start_point = obstack_alloc (&m_obstack, 0);

	  for (basic_block bb = linfo.block_list; bb;
	       bb = m_next_block_in_loop[bb->index])
	    if (!analyze_block (bb))
	      {
		linfo.rejected_p = true;
		break;
	      }

	  if (!linfo.rejected_p)
	    {
	      /* Process any queued address fragments, now that we have
		 complete grouping information.  */
	      address_info *address;
	      unsigned int i;
	      FOR_EACH_VEC_ELT (m_address_list, i, address)
		analyze_address_fragment (*address);
	    }

	  m_address_table.empty ();
	  m_address_list.truncate (0);
	  obstack_free (&m_obstack, start_point);
	}
    }

  return m_num_conditions != 0;
}
/* Use the ranges in VRS to remove impossible versioning conditions from
   LOOP.  */

void
loop_versioning::prune_loop_conditions (struct loop *loop, vr_values *vrs)
{
  loop_info &li = get_loop_info (loop);

  int to_remove = -1;
  bitmap_iterator bi;
  unsigned int i;
  EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
    {
      tree name = ssa_name (i);
      value_range *vr = vrs->get_value_range (name);
      if (vr && !range_includes_p (vr, 1))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "%T can never be 1 in this loop\n", name);

	  if (to_remove >= 0)
	    bitmap_clear_bit (&li.unity_names, to_remove);
	  to_remove = i;
	  m_num_conditions -= 1;
	}
    }
  if (to_remove >= 0)
    bitmap_clear_bit (&li.unity_names, to_remove);
}
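
/* For example (hypothetical ranges), if a loop was queued for the check
   stride_1 == 1 but the range analysis has shown that stride_1 is in
   [2, 8] at the loop header, the check can never succeed, so the code
   above removes stride_1 from UNITY_NAMES and decrements
   M_NUM_CONDITIONS.  */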
/* Remove any scheduled loop version conditions that will never be true.
   Return true if any remain.  */

bool
loop_versioning::prune_conditions ()
{
  AUTO_DUMP_SCOPE ("prune_loop_conditions",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  calculate_dominance_info (CDI_DOMINATORS);
  lv_dom_walker dom_walker (*this);
  dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
  return m_num_conditions != 0;
}
/* Merge the version checks for INNER into immediately-enclosing loop
   OUTER.  */

void
loop_versioning::merge_loop_info (struct loop *outer, struct loop *inner)
{
  loop_info &inner_li = get_loop_info (inner);
  loop_info &outer_li = get_loop_info (outer);

  if (dump_enabled_p ())
    {
      bitmap_iterator bi;
      unsigned int i;
      EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
	if (!bitmap_bit_p (&outer_li.unity_names, i))
	  dump_printf_loc (MSG_NOTE, find_loop_location (inner),
			   "hoisting check that %T == 1 to outer loop\n",
			   ssa_name (i));
    }

  bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
  if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
    outer_li.outermost = inner_li.outermost;
}

/* Add LOOP to the queue of loops to version.  */

void
loop_versioning::add_loop_to_queue (struct loop *loop)
{
  loop_info &li = get_loop_info (loop);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
		     "queuing this loop for versioning\n");
  m_loops_to_version.safe_push (loop);

  /* Don't try to version superloops.  */
  li.rejected_p = true;
}
/* Decide whether the cost model would allow us to version LOOP,
   either directly or as part of a parent loop, and return true if so.
   This does not imply that the loop is actually worth versioning in its
   own right, just that it would be valid to version it if something
   benefited.

   We have already made this decision for all inner loops of LOOP.  */

bool
loop_versioning::decide_whether_loop_is_versionable (struct loop *loop)
{
  loop_info &li = get_loop_info (loop);

  if (li.rejected_p)
    return false;

  /* Examine the decisions made for inner loops.  */
  for (struct loop *inner = loop->inner; inner; inner = inner->next)
    {
      loop_info &inner_li = get_loop_info (inner);
      if (inner_li.rejected_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "not versioning this loop because one of its"
			     " inner loops should not be versioned\n");
	  return false;
	}

      if (inner_li.worth_versioning_p ())
	li.subloops_benefit_p = true;

      /* Accumulate the number of instructions from subloops that are not
	 the innermost, or that don't benefit from versioning.  Only the
	 instructions from innermost loops that benefit from versioning
	 should be weighed against loop-versioning-max-inner-insns;
	 everything else should be weighed against
	 loop-versioning-max-outer-insns.  */
      if (!inner_li.worth_versioning_p () || inner->inner)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "counting %d instructions from this loop"
			     " against its parent loop\n",
			     inner_li.num_insns);
	  li.num_insns += inner_li.num_insns;
	}
    }

  /* Enforce the size limits.  */
  if (li.worth_versioning_p ())
    {
      unsigned int max_num_insns = max_insns_for_loop (loop);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			 "this loop has %d instructions, against"
			 " a versioning limit of %d\n",
			 li.num_insns, max_num_insns);
      if (li.num_insns > max_num_insns)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION
			     | MSG_PRIORITY_USER_FACING,
			     find_loop_location (loop),
			     "this loop is too big to version");
	  return false;
	}
    }

  /* Hoist all version checks from subloops to this loop.  */
  for (struct loop *subloop = loop->inner; subloop; subloop = subloop->next)
    merge_loop_info (loop, subloop);

  return true;
}
/* Decide which loops to version and add them to the versioning queue.
   Return true if there are any loops to version.  */

bool
loop_versioning::make_versioning_decisions ()
{
  AUTO_DUMP_SCOPE ("make_versioning_decisions",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  struct loop *loop;
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop_info &linfo = get_loop_info (loop);
      if (decide_whether_loop_is_versionable (loop))
	{
	  /* Commit to versioning LOOP directly if we can't hoist the
	     version checks any further.  */
	  if (linfo.worth_versioning_p ()
	      && (loop_depth (loop) == 1 || linfo.outermost == loop))
	    add_loop_to_queue (loop);
	}
      else
	{
	  /* We can't version this loop, so individually version any
	     subloops that would benefit and haven't been versioned yet.  */
	  linfo.rejected_p = true;
	  for (struct loop *subloop = loop->inner; subloop;
	       subloop = subloop->next)
	    if (get_loop_info (subloop).worth_versioning_p ())
	      add_loop_to_queue (subloop);
	}
    }

  return !m_loops_to_version.is_empty ();
}
/* Attempt to implement loop versioning for LOOP, using the information
   cached in the associated loop_info.  Return true on success.  */

bool
loop_versioning::version_loop (struct loop *loop)
{
  loop_info &li = get_loop_info (loop);

  /* Build up a condition that selects the original loop instead of
     the simplified loop.  */
  tree cond = boolean_false_node;
  bitmap_iterator bi;
  unsigned int i;
  EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
    {
      tree name = ssa_name (i);
      tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
				 build_one_cst (TREE_TYPE (name)));
      cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
    }

  /* Convert the condition into a suitable gcond.  */
  gimple_seq stmts = NULL;
  cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr, NULL_TREE);

  /* Version the loop.  */
  initialize_original_copy_tables ();
  basic_block cond_bb;
  li.optimized_loop = loop_version (loop, cond, &cond_bb,
				    profile_probability::unlikely (),
				    profile_probability::likely (),
				    profile_probability::unlikely (),
				    profile_probability::likely (), true);
  free_original_copy_tables ();
  if (!li.optimized_loop)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
			 "tried but failed to version this loop for when"
			 " certain strides are 1\n");
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
		     "versioned this loop for when certain strides are 1\n");

  /* Insert the statements that feed COND.  */
  if (stmts)
    {
      gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }

  return true;
}
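
/* For example (illustrative SSA names), if UNITY_NAMES contains
   stride1_4 and stride2_7, the condition built above is:

     stride1_4 != 1 || stride2_7 != 1

   so the original loop runs whenever any stride differs from 1, and the
   simplified copy (LI.optimized_loop) runs on the false edge.  */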
/* Attempt to version all loops in the versioning queue.  */

void
loop_versioning::implement_versioning_decisions ()
{
  /* No AUTO_DUMP_SCOPE here since all messages are top-level and
     user-facing at this point.  */

  bool any_succeeded_p = false;
  struct loop *loop;
  unsigned int i;
  FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
    if (version_loop (loop))
      any_succeeded_p = true;
  if (!any_succeeded_p)
    return;

  update_ssa (TODO_update_ssa);

  /* Simplify the new loop, which is used when COND is false.  */
  FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
    {
      loop_info &linfo = get_loop_info (loop);
      if (linfo.optimized_loop)
	name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
    }
}

/* Run the pass and return a set of TODO_* flags.  */

unsigned int
loop_versioning::run ()
{
  gcc_assert (scev_initialized_p ());

  if (analyze_blocks ()
      && prune_conditions ()
      && make_versioning_decisions ())
    implement_versioning_decisions ();

  return 0;
}
/* Loop versioning pass.  */

const pass_data pass_data_loop_versioning =
{
  GIMPLE_PASS, /* type */
  "lversion", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_VERSIONING, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_versioning : public gimple_opt_pass
{
public:
  pass_loop_versioning (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_versioning, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_version_loops_for_strides; }
  virtual unsigned int execute (function *);
};

unsigned int
pass_loop_versioning::execute (function *fn)
{
  if (number_of_loops (fn) <= 1)
    return 0;

  return loop_versioning (fn).run ();
}

} // anon namespace

gimple_opt_pass *
make_pass_loop_versioning (gcc::context *ctxt)
{
  return new pass_loop_versioning (ctxt);
}