/* Loop versioning pass.
   Copyright (C) 2018-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "tree-pass.h"
#include "gimplify-me.h"
#include "cfgloop.h"
#include "tree-ssa-loop.h"
#include "ssa.h"
#include "tree-scalar-evolution.h"
#include "tree-chrec.h"
#include "tree-ssa-loop-ivopts.h"
#include "fold-const.h"
#include "tree-ssa-propagate.h"
#include "tree-inline.h"
#include "domwalk.h"
#include "alloc-pool.h"
#include "vr-values.h"
#include "gimple-ssa-evrp-analyze.h"
#include "tree-vectorizer.h"
#include "omp-general.h"
#include "predict.h"
#include "tree-into-ssa.h"
#include "params.h"

namespace {
/* This pass looks for loops that could be simplified if certain loop
   invariant conditions were true.  It is effectively a form of loop
   splitting in which the pass produces the split conditions itself,
   instead of using ones that are already present in the IL.

   Versioning for when strides are 1
   ---------------------------------

   At the moment the only thing the pass looks for is memory references
   like:

     for (auto i : ...)
       ...x[i * stride]...

   It considers changing such loops to:

     if (stride == 1)
       for (auto i : ...)    [A]
         ...x[i]...
     else
       for (auto i : ...)    [B]
         ...x[i * stride]...

   This can have several benefits:

   (1) [A] is often easier or cheaper to vectorize than [B].

   (2) The scalar code in [A] is simpler than the scalar code in [B]
       (if the loops cannot be vectorized or need an epilogue loop).

   (3) We might recognize [A] as a pattern, such as a memcpy or memset.

   (4) [A] has simpler address evolutions, which can help other passes
       like loop interchange.

   The optimization is particularly useful for assumed-shape arrays in
   Fortran, where the stride of the innermost dimension depends on the
   array descriptor but is often equal to 1 in practice.  For example:

     subroutine f1(x)
       real :: x(:)
       x(:) = 100
     end subroutine f1

   generates the equivalent of:

     raw_stride = *x.dim[0].stride;
     stride = raw_stride != 0 ? raw_stride : 1;
     x_base = *x.data;
     ...
     tmp1 = stride * S;
     tmp2 = tmp1 - stride;
     *x_base[tmp2] = 1.0e+2;

   but in the common case that stride == 1, the last three statements
   simplify to:

     tmp3 = S + -1;
     *x_base[tmp3] = 1.0e+2;

   The optimization is in principle very simple.  The difficult parts are:

   (a) deciding which parts of a general address calculation correspond
       to the inner dimension of an array, since this usually isn't explicit
       in the IL, and for C often isn't even explicit in the source code

   (b) estimating when the transformation is worthwhile

   Structure
   ---------

   The pass has four phases:

   (1) Walk through the statements looking for and recording potential
       versioning opportunities.  Stop if there are none.

   (2) Use context-sensitive range information to see whether any versioning
       conditions are impossible in practice.  Remove them if so, and stop
       if no opportunities remain.

       (We do this only after (1) to keep compile time down when no
       versioning opportunities exist.)

   (3) Apply the cost model.  Decide which versioning opportunities are
       worthwhile and at which nesting level they should be applied.

   (4) Attempt to version all the loops selected by (3), so that:

         for (...)
           ...

       becomes:

         if (!cond)
           for (...) // Original loop
             ...
         else
           for (...) // New loop
             ...

       Use the version condition COND to simplify the new loop.  */
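
/* As a concrete illustration (hypothetical code, not taken from any
   particular testcase), a C function such as:

     void
     f (double *x, int stride, int n)
     {
       for (int i = 0; i < n; ++i)
         x[i * stride] += 1.0;
     }

   would conceptually be transformed as if the source had been:

     if (stride != 1)
       for (int i = 0; i < n; ++i)   // Original loop
         x[i * stride] += 1.0;
     else
       for (int i = 0; i < n; ++i)   // Simplified, easily-vectorized loop
         x[i] += 1.0;  */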
/* Enumerates the likelihood that a particular value indexes the inner
   dimension of an array.  */
enum inner_likelihood {
  INNER_UNLIKELY,
  INNER_DONT_KNOW,
  INNER_LIKELY
};
/* Information about one term of an address_info.  */
struct address_term_info
{
  /* The value of the term is EXPR * MULTIPLIER.  */
  tree expr;
  unsigned HOST_WIDE_INT multiplier;

  /* The stride applied by EXPR in each iteration of some unrecorded loop,
     or null if no stride has been identified.  */
  tree stride;

  /* Enumerates the likelihood that EXPR indexes the inner dimension
     of an array.  */
  enum inner_likelihood inner_likelihood;

  /* True if STRIDE == 1 is a versioning opportunity when considered
     in isolation.  */
  bool versioning_opportunity_p;
};
/* Information about an address calculation, and the range of constant
   offsets applied to it.  */
struct address_info
{
  static const unsigned int MAX_TERMS = 8;

  /* One statement that calculates the address.  If multiple statements
     share the same address, we only record the first.  */
  gimple *stmt;

  /* The loop containing STMT (cached for convenience).  If multiple
     statements share the same address, they all belong to this loop.  */
  struct loop *loop;

  /* A decomposition of the calculation into a sum of terms plus an
     optional base.  When BASE is provided, it is never an SSA name.
     Once initialization is complete, all members of TERMS are SSA names.  */
  tree base;
  auto_vec<address_term_info, MAX_TERMS> terms;

  /* All bytes accessed from the address fall in the offset range
     [MIN_OFFSET, MAX_OFFSET).  */
  HOST_WIDE_INT min_offset, max_offset;
};
/* Stores addresses based on their base and terms (ignoring the offsets).  */
struct address_info_hasher : nofree_ptr_hash <address_info>
{
  static hashval_t hash (const address_info *);
  static bool equal (const address_info *, const address_info *);
};
/* Information about the versioning we'd like to apply to a loop.  */
struct loop_info
{
  bool worth_versioning_p () const;

  /* True if we've decided not to version this loop.  The remaining
     fields are meaningless if so.  */
  bool rejected_p;

  /* True if at least one subloop of this loop benefits from versioning.  */
  bool subloops_benefit_p;

  /* An estimate of the total number of instructions in the loop,
     excluding those in subloops that benefit from versioning.  */
  unsigned int num_insns;

  /* The outermost loop that can handle all the version checks
     described below.  */
  struct loop *outermost;

  /* The first entry in the list of blocks that belong to this loop
     (and not to subloops).  m_next_block_in_loop provides the chain
     pointers for the list.  */
  basic_block block_list;

  /* We'd like to version the loop for the case in which these SSA names
     (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime.  */
  bitmap_head unity_names;

  /* If versioning succeeds, this points to the version of the loop that
     assumes the version conditions hold.  */
  struct loop *optimized_loop;
};
/* The main pass structure.  */
class loop_versioning
{
public:
  loop_versioning (function *);
  ~loop_versioning ();
  unsigned int run ();

private:
  /* Used to walk the dominator tree to find loop versioning conditions
     that are always false.  */
  class lv_dom_walker : public dom_walker
  {
  public:
    lv_dom_walker (loop_versioning &);

    edge before_dom_children (basic_block) FINAL OVERRIDE;
    void after_dom_children (basic_block) FINAL OVERRIDE;

  private:
    /* The parent pass.  */
    loop_versioning &m_lv;

    /* Used to build context-dependent range information.  */
    evrp_range_analyzer m_range_analyzer;
  };

  /* Used to simplify statements based on conditions that are established
     by the version checks.  */
  class name_prop : public substitute_and_fold_engine
  {
  public:
    name_prop (loop_info &li) : m_li (li) {}
    tree get_value (tree) FINAL OVERRIDE;

  private:
    /* Information about the versioning we've performed on the loop.  */
    loop_info &m_li;
  };

  loop_info &get_loop_info (struct loop *loop) { return m_loops[loop->num]; }

  unsigned int max_insns_for_loop (struct loop *);
  bool expensive_stmt_p (gimple *);

  void version_for_unity (gimple *, tree);
  bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
				unsigned HOST_WIDE_INT * = 0);
  bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
  bool multiply_term_by (address_term_info &, tree);
  inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
  void analyze_stride (address_info &, address_term_info &,
		       tree, struct loop *);
  bool find_per_loop_multiplication (address_info &, address_term_info &);
  void analyze_term_using_scevs (address_info &, address_term_info &);
  void analyze_address_fragment (address_info &);
  void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
				tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
  void analyze_expr (gimple *, tree);
  bool analyze_block (basic_block);
  bool analyze_blocks ();

  void prune_loop_conditions (struct loop *, vr_values *);
  bool prune_conditions ();

  void merge_loop_info (struct loop *, struct loop *);
  void add_loop_to_queue (struct loop *);
  bool decide_whether_loop_is_versionable (struct loop *);
  bool make_versioning_decisions ();

  bool version_loop (struct loop *);
  void implement_versioning_decisions ();

  /* The function we're optimizing.  */
  function *m_fn;

  /* The obstack to use for all pass-specific bitmaps.  */
  bitmap_obstack m_bitmap_obstack;

  /* An obstack to use for general allocation.  */
  obstack m_obstack;

  /* The number of loops in the function.  */
  unsigned int m_nloops;

  /* The total number of loop version conditions we've found.  */
  unsigned int m_num_conditions;

  /* Assume that an address fragment of the form i * stride * scale
     (for variable stride and constant scale) will not benefit from
     versioning for stride == 1 when scale is greater than this value.  */
  unsigned HOST_WIDE_INT m_maximum_scale;

  /* Information about each loop.  */
  auto_vec<loop_info> m_loops;

  /* Used to form a linked list of blocks that belong to a loop,
     started by loop_info::block_list.  */
  auto_vec<basic_block> m_next_block_in_loop;

  /* The list of loops that we've decided to version.  */
  auto_vec<struct loop *> m_loops_to_version;

  /* A table of addresses in the current loop, keyed off their values
     but not their offsets.  */
  hash_table <address_info_hasher> m_address_table;

  /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
  auto_vec <address_info *, 32> m_address_list;
};
/* If EXPR is an SSA name and not a default definition, return the
   defining statement, otherwise return null.  */

static gimple *
maybe_get_stmt (tree expr)
{
  if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
    return SSA_NAME_DEF_STMT (expr);
  return NULL;
}

/* Like maybe_get_stmt, but also return null if the defining
   statement isn't an assignment.  */

static gassign *
maybe_get_assign (tree expr)
{
  return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
}

/* Return true if this pass should look through a cast of expression FROM
   to type TYPE when analyzing pieces of an address.  */

static bool
look_through_cast_p (tree type, tree from)
{
  return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
	  && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
}

/* Strip all conversions of integers or pointers from EXPR, regardless
   of whether the conversions are nops.  This is useful in the context
   of this pass because we're not trying to fold or simulate the
   expression; we just want to see how it's structured.  */

static tree
strip_casts (tree expr)
{
  const unsigned int MAX_NITERS = 4;

  tree type = TREE_TYPE (expr);
  while (CONVERT_EXPR_P (expr)
	 && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
    expr = TREE_OPERAND (expr, 0);

  for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
    {
      gassign *assign = maybe_get_assign (expr);
      if (assign
	  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
	  && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
	expr = gimple_assign_rhs1 (assign);
      else
	break;
    }
  return expr;
}
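
/* For example (using illustrative SSA names), given a definition chain
   such as:

     _1 = (sizetype) i_2;
     _3 = (long unsigned int) _1;

   strip_casts (_3) returns i_2.  The conversions need not be
   value-preserving; we only want to see how the address is built up.  */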
/* Compare two address_term_infos in the same address_info.  */

static int
compare_address_terms (const void *a_uncast, const void *b_uncast)
{
  const address_term_info *a = (const address_term_info *) a_uncast;
  const address_term_info *b = (const address_term_info *) b_uncast;

  if (a->expr != b->expr)
    return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;

  if (a->multiplier != b->multiplier)
    return a->multiplier < b->multiplier ? -1 : 1;

  return 0;
}

/* Dump ADDRESS using flags FLAGS.  */

static void
dump_address_info (dump_flags_t flags, address_info &address)
{
  if (address.base)
    dump_printf (flags, "%T + ", address.base);
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    {
      if (i != 0)
	dump_printf (flags, " + ");
      dump_printf (flags, "%T", address.terms[i].expr);
      if (address.terms[i].multiplier != 1)
	dump_printf (flags, " * %wd", address.terms[i].multiplier);
    }
  dump_printf (flags, " + [%wd, %wd]",
	       address.min_offset, address.max_offset - 1);
}

/* Hash an address_info based on its base and terms.  */

hashval_t
address_info_hasher::hash (const address_info *info)
{
  inchash::hash hash;
  hash.add_int (info->base ? TREE_CODE (info->base) : 0);
  hash.add_int (info->terms.length ());
  for (unsigned int i = 0; i < info->terms.length (); ++i)
    {
      hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
      hash.add_hwi (info->terms[i].multiplier);
    }
  return hash.end ();
}

/* Return true if two address_infos have equal bases and terms.  Other
   properties might be different (such as the statement or constant
   offset range).  */

bool
address_info_hasher::equal (const address_info *a, const address_info *b)
{
  if (a->base != b->base
      && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
    return false;

  if (a->terms.length () != b->terms.length ())
    return false;

  for (unsigned int i = 0; i < a->terms.length (); ++i)
    if (a->terms[i].expr != b->terms[i].expr
	|| a->terms[i].multiplier != b->terms[i].multiplier)
      return false;

  return true;
}
/* Return true if we want to version the loop, i.e. if we have a
   specific reason for doing so and no specific reason not to.  */

bool
loop_info::worth_versioning_p () const
{
  return (!rejected_p
	  && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
}

loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
  : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
{
}

/* Process BB before processing the blocks it dominates.  */

edge
loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
{
  m_range_analyzer.enter (bb);

  if (bb == bb->loop_father->header)
    m_lv.prune_loop_conditions (bb->loop_father,
				m_range_analyzer.get_vr_values ());

  for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
       gsi_next (&si))
    m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false);

  return NULL;
}

/* Process BB after processing the blocks it dominates.  */

void
loop_versioning::lv_dom_walker::after_dom_children (basic_block bb)
{
  m_range_analyzer.leave (bb);
}

/* Decide whether to replace VAL with a new value in a versioned loop.
   Return the new value if so, otherwise return null.  */

tree
loop_versioning::name_prop::get_value (tree val)
{
  if (TREE_CODE (val) == SSA_NAME
      && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
    return build_one_cst (TREE_TYPE (val));
  return NULL_TREE;
}
/* Initialize the structure to optimize FN.  */

loop_versioning::loop_versioning (function *fn)
  : m_fn (fn),
    m_nloops (number_of_loops (fn)),
    m_num_conditions (0),
    m_address_table (31)
{
  bitmap_obstack_initialize (&m_bitmap_obstack);
  gcc_obstack_init (&m_obstack);

  /* Initialize the loop information.  */
  m_loops.safe_grow_cleared (m_nloops);
  for (unsigned int i = 0; i < m_nloops; ++i)
    {
      m_loops[i].outermost = get_loop (m_fn, 0);
      bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
    }

  /* Initialize the list of blocks that belong to each loop.  */
  unsigned int nbbs = last_basic_block_for_fn (fn);
  m_next_block_in_loop.safe_grow (nbbs);
  basic_block bb;
  FOR_EACH_BB_FN (bb, fn)
    {
      loop_info &li = get_loop_info (bb->loop_father);
      m_next_block_in_loop[bb->index] = li.block_list;
      li.block_list = bb;
    }

  /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
     unvectorizable code, since it is the largest size that can be
     handled efficiently by scalar code.  omp_max_vf calculates the
     maximum number of bytes in a vector, when such a value is relevant
     to loop optimization.  */
  m_maximum_scale = estimated_poly_value (omp_max_vf ());
  m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
}

loop_versioning::~loop_versioning ()
{
  bitmap_obstack_release (&m_bitmap_obstack);
  obstack_free (&m_obstack, NULL);
}
/* Return the maximum number of instructions allowed in LOOP before
   it becomes too big for versioning.

   There are separate limits for inner and outer loops.  The limit for
   inner loops applies only to loops that benefit directly from versioning.
   The limit for outer loops applies to all code in the outer loop and
   its subloops that *doesn't* benefit directly from versioning; such code
   would be "taken along for the ride".  The idea is that if the cost of
   the latter is small, it is better to version outer loops rather than
   inner loops, both to reduce the number of repeated checks and to enable
   more of the loop nest to be optimized as a natural nest (e.g. by loop
   interchange or outer-loop vectorization).  */

unsigned int
loop_versioning::max_insns_for_loop (struct loop *loop)
{
  return (loop->inner
	  ? PARAM_VALUE (PARAM_LOOP_VERSIONING_MAX_OUTER_INSNS)
	  : PARAM_VALUE (PARAM_LOOP_VERSIONING_MAX_INNER_INSNS));
}

/* Return true if for cost reasons we should avoid versioning any loop
   that contains STMT.

   Note that we don't need to check whether versioning is invalid for
   correctness reasons, since the versioning process does that for us.
   The conditions involved are too rare to be worth duplicating here.  */

bool
loop_versioning::expensive_stmt_p (gimple *stmt)
{
  if (gcall *call = dyn_cast <gcall *> (stmt))
    /* Assume for now that the time spent in an "expensive" call would
       overwhelm any saving from versioning.  */
    return !gimple_inexpensive_call_p (call);
  return false;
}
/* Record that we want to version the loop that contains STMT for the
   case in which SSA name NAME is equal to 1.  We already know that NAME
   is invariant in the loop.  */

void
loop_versioning::version_for_unity (gimple *stmt, tree name)
{
  struct loop *loop = loop_containing_stmt (stmt);
  loop_info &li = get_loop_info (loop);

  if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
    {
      /* This is the first time we've wanted to version LOOP for NAME.
	 Keep track of the outermost loop that can handle all versioning
	 checks in LI.  */
      struct loop *outermost
	= outermost_invariant_loop_for_expr (loop, name);
      if (loop_depth (li.outermost) < loop_depth (outermost))
	li.outermost = outermost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
			   " for when %T == 1", name);
	  if (outermost == loop)
	    dump_printf (MSG_NOTE, "; cannot hoist check further");
	  else
	    {
	      dump_printf (MSG_NOTE, "; could implement the check at loop"
			   " depth %d", loop_depth (outermost));
	      if (loop_depth (li.outermost) > loop_depth (outermost))
		dump_printf (MSG_NOTE, ", but other checks only allow"
			     " a depth of %d", loop_depth (li.outermost));
	    }
	  dump_printf (MSG_NOTE, "\n");
	}

      m_num_conditions += 1;
    }
  else
    {
      /* This is a duplicate request.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
			 " loop for when %T == 1\n", name);
    }
}
/* Return true if OP1_TREE is constant and if in principle it is worth
   versioning an address fragment of the form:

     i * OP1_TREE * OP2 * stride

   for the case in which stride == 1.  This in practice means testing
   whether:

     OP1_TREE * OP2 <= M_MAXIMUM_SCALE.

   If RESULT is nonnull, store OP1_TREE * OP2 there when returning true.  */

bool
loop_versioning::acceptable_multiplier_p (tree op1_tree,
					  unsigned HOST_WIDE_INT op2,
					  unsigned HOST_WIDE_INT *result)
{
  if (tree_fits_uhwi_p (op1_tree))
    {
      unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
      /* The first part checks for overflow.  */
      if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
	{
	  if (result)
	    *result = op1 * op2;
	  return true;
	}
    }
  return false;
}
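
/* For example, assuming for illustration that M_MAXIMUM_SCALE is 64:
   with an existing multiplier OP2 of 8, a constant OP1_TREE of 4 gives
   4 * 8 = 32 <= 64, so the call returns true and stores 32 in *RESULT,
   whereas a constant OP1_TREE of 16 would give 128 and be rejected.  */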
/* Return true if it is worth using loop versioning on a memory access
   of type TYPE.  Store the size of the access in *SIZE if so.  */

bool
loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
{
  return (TYPE_SIZE_UNIT (type)
	  && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
}

/* See whether OP is constant and whether we can multiply TERM by that
   constant without exceeding M_MAXIMUM_SCALE.  Return true and update
   TERM if so.  */

bool
loop_versioning::multiply_term_by (address_term_info &term, tree op)
{
  return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
}
/* Decide whether an address fragment of the form STRIDE * MULTIPLIER
   is likely to be indexing an innermost dimension, returning the result
   as an INNER_* probability.  */

inner_likelihood
loop_versioning::get_inner_likelihood (tree stride,
				       unsigned HOST_WIDE_INT multiplier)
{
  const unsigned int MAX_NITERS = 8;

  /* Iterate over possible values of STRIDE.  Return INNER_LIKELY if at
     least one of those values is likely to be for the innermost dimension.
     Record in UNLIKELY_P if at least one of those values is unlikely to be
     for the innermost dimension.

     E.g. for:

       stride = cond ? a * b : 1

     we should treat STRIDE as being a likely inner dimension, since
     we know that it is 1 under at least some circumstances.  (See the
     Fortran example below.)  However:

       stride = a * b

     on its own is unlikely to be for the innermost dimension, since
     that would require both a and b to be 1 at runtime.  */
  bool unlikely_p = false;
  tree worklist[MAX_NITERS];
  unsigned int length = 0;
  worklist[length++] = stride;
  for (unsigned int i = 0; i < length; ++i)
    {
      tree expr = worklist[i];

      if (CONSTANT_CLASS_P (expr))
	{
	  /* See if EXPR * MULTIPLIER would be consistent with an individual
	     access or a small grouped access.  */
	  if (acceptable_multiplier_p (expr, multiplier))
	    return INNER_LIKELY;
	  else
	    unlikely_p = true;
	}
      else if (gimple *stmt = maybe_get_stmt (expr))
	{
	  /* If EXPR is set by a PHI node, queue its arguments in case
	     we find one that is consistent with an inner dimension.

	     An important instance of this is the Fortran handling of array
	     descriptors, which calculates the stride of the inner dimension
	     using a PHI equivalent of:

		 raw_stride = a.dim[0].stride;
		 stride = raw_stride != 0 ? raw_stride : 1;

	     (Strides for outer dimensions do not treat 0 specially.)  */
	  if (gphi *phi = dyn_cast <gphi *> (stmt))
	    {
	      unsigned int nargs = gimple_phi_num_args (phi);
	      for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
		worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
	    }
	  /* If the value is set by an assignment, expect it to be read
	     from memory (such as an array descriptor) rather than be
	     calculated.  */
	  else if (gassign *assign = dyn_cast <gassign *> (stmt))
	    {
	      if (!gimple_assign_load_p (assign))
		unlikely_p = true;
	    }
	  /* Things like calls don't really tell us anything.  */
	}
    }

  /* We didn't find any possible values of STRIDE that were likely to be
     for the innermost dimension.  If we found one that was actively
     unlikely to be for the innermost dimension, assume that that applies
     to STRIDE too.  */
  return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
}
/* The caller has identified that STRIDE is the stride of interest
   in TERM, and that the stride is applied in OP_LOOP.  Record this
   information in TERM, deciding whether STRIDE is likely to be for
   the innermost dimension of an array and whether it represents a
   versioning opportunity.  ADDRESS is the address that contains TERM.  */

void
loop_versioning::analyze_stride (address_info &address,
				 address_term_info &term,
				 tree stride, struct loop *op_loop)
{
  term.stride = stride;

  term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
  if (dump_enabled_p ())
    {
      if (term.inner_likelihood == INNER_LIKELY)
	dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
			 " innermost dimension\n", stride);
      else if (term.inner_likelihood == INNER_UNLIKELY)
	dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
			 " innermost dimension\n", stride);
      else
	dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
			 " is the innermost dimension\n", stride);
    }

  /* To be a versioning opportunity we require:

     - The multiplier applied by TERM is equal to the access size,
       so that when STRIDE is 1, the accesses in successive loop
       iterations are consecutive.

       This is deliberately conservative.  We could relax it to handle
       other cases (such as those with gaps between iterations) if we
       find any real testcases for which it's useful.

     - the stride is applied in the same loop as STMT rather than
       in an outer loop.  Although versioning for strides applied in
       outer loops could help in some cases -- such as enabling
       more loop interchange -- the savings are much lower than for
       inner loops.

     - the stride is an SSA name that is invariant in STMT's loop,
       since otherwise versioning isn't possible.  */
  unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
  if (term.multiplier == access_size
      && address.loop == op_loop
      && TREE_CODE (stride) == SSA_NAME
      && expr_invariant_in_loop_p (address.loop, stride))
    {
      term.versioning_opportunity_p = true;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
			 " opportunity\n", stride);
    }
}
/* See whether address term TERM (which belongs to ADDRESS) is the result
   of multiplying a varying SSA name by a loop-invariant SSA name.
   Return true and update TERM if so.

   This handles both cases that SCEV might handle, such as:

     for (int i = 0; i < n; ++i)
       res += a[i * stride];

   and ones in which the term varies arbitrarily between iterations, such as:

     for (int i = 0; i < n; ++i)
       res += a[index[i] * stride];  */

bool
loop_versioning::find_per_loop_multiplication (address_info &address,
					       address_term_info &term)
{
  gimple *mult = maybe_get_assign (term.expr);
  if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
    return false;

  struct loop *mult_loop = loop_containing_stmt (mult);
  if (!loop_outer (mult_loop))
    return false;

  tree op1 = strip_casts (gimple_assign_rhs1 (mult));
  tree op2 = strip_casts (gimple_assign_rhs2 (mult));
  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
    return false;

  bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
  bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
  if (invariant1_p == invariant2_p)
    return false;

  /* Make sure that the loop invariant is OP2 rather than OP1.  */
  if (invariant1_p)
    std::swap (op1, op2);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
		     " * loop-invariant %T\n", term.expr, op1, op2);
  analyze_stride (address, term, op2, mult_loop);
  return true;
}
/* Try to use scalar evolutions to find an address stride for TERM,
   which belongs to ADDRESS.

   Here we are interested in any evolution information we can find,
   not just evolutions wrt ADDRESS->LOOP.  For example, if we find that
   an outer loop obviously iterates over the inner dimension of an array,
   that information can help us eliminate worthless versioning opportunities
   in inner loops.  */

void
loop_versioning::analyze_term_using_scevs (address_info &address,
					   address_term_info &term)
{
  gimple *setter = maybe_get_stmt (term.expr);
  if (!setter)
    return;

  struct loop *wrt_loop = loop_containing_stmt (setter);
  if (!loop_outer (wrt_loop))
    return;

  tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
  if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, address.stmt,
			 "address term %T = %T\n", term.expr, chrec);

      /* Peel casts and accumulate constant multiplications, up to the
	 limit allowed by M_MAXIMUM_SCALE.  */
      tree stride = strip_casts (CHREC_RIGHT (chrec));
      while (TREE_CODE (stride) == MULT_EXPR
	     && multiply_term_by (term, TREE_OPERAND (stride, 1)))
	stride = strip_casts (TREE_OPERAND (stride, 0));

      gassign *assign;
      while ((assign = maybe_get_assign (stride))
	     && gimple_assign_rhs_code (assign) == MULT_EXPR
	     && multiply_term_by (term, gimple_assign_rhs2 (assign)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, address.stmt,
			     "looking through %G", assign);
	  stride = strip_casts (gimple_assign_rhs1 (assign));
	}

      analyze_stride (address, term, stride, get_chrec_loop (chrec));
    }
}
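
/* For example (illustrative only), if an index is accumulated rather
   than multiplied explicitly:

     int idx = 0;
     for (int i = 0; i < n; ++i)
       {
	 res += a[idx];
	 idx += stride;
       }

   the term's definition is a PHI, so find_per_loop_multiplication fails,
   but SCEV analysis gives the evolution {0, +, stride}, whose CHREC_RIGHT
   supplies "stride" as the candidate passed to analyze_stride.  The two
   while loops above would additionally peel a constant factor out of a
   step such as stride * 4 into TERM.multiplier.  */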
/* Try to identify loop strides in ADDRESS and try to choose realistic
   versioning opportunities based on these strides.

   The main difficulty here isn't finding strides that could be used
   in a version check (that's pretty easy).  The problem instead is to
   avoid versioning for some stride S that is unlikely ever to be 1 at
   runtime.  Versioning for S == 1 on its own would lead to unnecessary
   code bloat, while adding S == 1 to more realistic version conditions
   would lose the optimisation opportunity offered by those other conditions.

   For example, versioning for a stride of 1 in the Fortran code:

     integer :: a(:,:)
     a(1,:) = 1

   is not usually a good idea, since the assignment is iterating over
   an outer dimension and is relatively unlikely to have a stride of 1.
   (It isn't impossible, since the inner dimension might be 1, or the
   array might be transposed.)  Similarly, in:

     integer :: a(:,:), b(:,:)
     b(:,1) = a(1,:)

   b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
   Versioning for when both strides are 1 would lose most of the benefit
   of versioning for b's access.

   The approach we take is as follows:

   - Analyze each term to see whether it has an identifiable stride,
     regardless of which loop applies the stride.

   - Evaluate the likelihood that each such stride is for the innermost
     dimension of an array, on the scale "likely", "don't know" or "unlikely".

   - If there is a single "likely" innermost stride, and that stride is
     applied in the loop that contains STMT, version the loop for when the
     stride is 1.  This deals with the cases in which we're fairly
     confident of doing the right thing, such as the b(:,1) reference above.

   - If there are no "likely" innermost strides, and the loop that contains
     STMT uses a stride that we rated as "don't know", version for when
     that stride is 1.  This is principally used for C code such as:

       for (int i = 0; i < n; ++i)
	 a[i * x] = ...;

     and:

       for (int j = 0; j < n; ++j)
	 for (int i = 0; i < n; ++i)
	   a[i * x + j * y] = ...;

     where nothing in the way "x" and "y" are set gives a hint as to
     whether "i" iterates over the innermost dimension of the array.
     In these situations it seems reasonable to assume that the
     programmer has nested the loops appropriately (although of course
     there are examples like GEMM in which this assumption doesn't hold
     for all accesses in the loop).

     This case is also useful for the Fortran equivalent of the
     above C code.  */
void
loop_versioning::analyze_address_fragment (address_info &address)
{
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
      dump_address_info (MSG_NOTE, address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Analyze each component of the sum to see whether it involves an
     apparent stride.

     There is an overlap between the addresses that
     find_per_loop_multiplication and analyze_term_using_scevs can handle,
     but the former is much cheaper than SCEV analysis, so try it first.  */
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (!find_per_loop_multiplication (address, address.terms[i]))
      analyze_term_using_scevs (address, address.terms[i]);

  /* Check for strides that are likely to be for the innermost dimension.

     1. If there is a single likely inner stride, if it is an SSA name,
	and if it is worth versioning the loop for when the SSA name
	equals 1, record that we want to do so.

     2. Otherwise, if there are any likely inner strides, bail out.
	This means one of:

	(a) There are multiple likely inner strides.  This suggests we're
	    confused and can't be confident of doing the right thing.

	(b) There is a single likely inner stride and it is a constant
	    rather than an SSA name.  This can mean either that the access
	    is a natural one without any variable strides, such as:

	      for (int i = 0; i < n; ++i)
		a[i] += 1;

	    or that a variable stride is applied to an outer dimension,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j * stride][i] += 1;

	(c) There is a single likely inner stride, and it is an SSA name,
	    but it isn't a worthwhile versioning opportunity.  This usually
	    means that the variable stride is applied by an outer loop,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j][i * stride] += 1;

	    or (using an example with a more natural loop nesting):

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[i][j] += b[i * stride];

	    in cases where b[i * stride] cannot (yet) be hoisted for
	    aliasing reasons.

     3. If there are no likely inner strides, fall through to the next
	set of checks.

     Pointer equality is enough to check for uniqueness in (1), since we
     only care about SSA names.  */
  tree chosen_stride = NULL_TREE;
  tree version_stride = NULL_TREE;
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (chosen_stride != address.terms[i].stride
	&& address.terms[i].inner_likelihood == INNER_LIKELY)
      {
	if (chosen_stride)
	  return;
	chosen_stride = address.terms[i].stride;
	if (address.terms[i].versioning_opportunity_p)
	  version_stride = chosen_stride;
      }

  /* If there are no likely inner strides, see if there is a single
     versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
     See the comment above the function for the cases that this code
     handles.  */
  if (!chosen_stride)
    for (unsigned int i = 0; i < address.terms.length (); ++i)
      if (version_stride != address.terms[i].stride
	  && address.terms[i].inner_likelihood == INNER_DONT_KNOW
	  && address.terms[i].versioning_opportunity_p)
	{
	  if (version_stride)
	    return;
	  version_stride = address.terms[i].stride;
	}

  if (version_stride)
    version_for_unity (address.stmt, version_stride);
}
/* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
   TYPE_SIZE bytes and record this address fragment for later processing.
   STMT is the statement that contains the address.  */
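/* For example (illustrative SSA names), given a 4-byte access whose
   address is built up as:

     _1 = i_2 * stride_3;
     _4 = _1 + j_5;
     _6 = _4 * 4;

   recording the fragment _6 * 1 decomposes it into the terms _1 * 4 and
   j_5 * 4: the constant multiplication is folded into the multipliers and
   the addition is split into separate terms.  The variable multiplication
   inside _1 is deliberately left alone here; find_per_loop_multiplication
   and the SCEV analysis above interpret it later.  */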
void
loop_versioning::record_address_fragment (gimple *stmt,
					  unsigned HOST_WIDE_INT type_size,
					  tree expr,
					  unsigned HOST_WIDE_INT multiplier,
					  HOST_WIDE_INT offset)
{
  /* We're only interested in computed values.  */
  if (TREE_CODE (expr) != SSA_NAME)
    return;

  /* Quick exit if no part of the address is calculated in STMT's loop,
     since such addresses have no versioning opportunities.  */
  struct loop *loop = loop_containing_stmt (stmt);
  if (expr_invariant_in_loop_p (loop, expr))
    return;

  /* Set up an address_info for EXPR * MULTIPLIER.  */
  address_info *address = XOBNEW (&m_obstack, address_info);
  new (address) address_info;
  address->stmt = stmt;
  address->loop = loop;
  address->base = NULL_TREE;
  address->terms.quick_grow (1);
  address->terms[0].expr = expr;
  address->terms[0].multiplier = multiplier;
  address->terms[0].stride = NULL_TREE;
  address->terms[0].inner_likelihood = INNER_UNLIKELY;
  address->terms[0].versioning_opportunity_p = false;
  address->min_offset = offset;

  /* Peel apart the expression into a sum of address_terms, where each
     term is multiplied by a constant.  Treat a + b and a - b the same,
     since it doesn't matter for our purposes whether an address is
     increasing or decreasing.  Distribute (a + b) * constant into
     a * constant + b * constant.

     We don't care which loop each term belongs to, since we want to
     examine as many candidate strides as possible when determining
     which is likely to be for the innermost dimension.  We therefore
     don't limit the search to statements in STMT's loop.  */
  for (unsigned int i = 0; i < address->terms.length (); )
    {
      if (gassign *assign = maybe_get_assign (address->terms[i].expr))
	{
	  tree_code code = gimple_assign_rhs_code (assign);
	  if (code == PLUS_EXPR
	      || code == POINTER_PLUS_EXPR
	      || code == MINUS_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (TREE_CODE (op2) == INTEGER_CST)
		{
		  address->terms[i].expr = strip_casts (op1);
		  /* This is heuristic only, so don't worry about truncation
		     or overflow.  */
		  address->min_offset += (TREE_INT_CST_LOW (op2)
					  * address->terms[i].multiplier);
		  continue;
		}
	      else if (address->terms.length () < address_info::MAX_TERMS)
		{
		  unsigned int j = address->terms.length ();
		  address->terms.quick_push (address->terms[i]);
		  address->terms[i].expr = strip_casts (op1);
		  address->terms[j].expr = strip_casts (op2);
		  continue;
		}
	    }
	  if (code == MULT_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (multiply_term_by (address->terms[i], op2))
		{
		  address->terms[i].expr = strip_casts (op1);
		  continue;
		}
	    }
	}
      i += 1;
    }

  /* Peel off any symbolic pointer.  */
  if (TREE_CODE (address->terms[0].expr) != SSA_NAME
      && address->terms[0].multiplier == 1)
    {
      if (address->terms.length () == 1)
	{
	  obstack_free (&m_obstack, address);
	  return;
	}
      address->base = address->terms[0].expr;
      address->terms.ordered_remove (0);
    }

  /* Require all remaining terms to be SSA names.  (This could be false
     for unfolded statements, but they aren't worth dealing with.)  */
  for (unsigned int i = 0; i < address->terms.length (); ++i)
    if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
      {
	obstack_free (&m_obstack, address);
	return;
      }

  /* The loop above set MIN_OFFSET based on the first byte of the
     referenced data.  Calculate the end + 1.  */
  address->max_offset = address->min_offset + type_size;

  /* Put the terms into a canonical order for the hash table lookup below.  */
  address->terms.qsort (compare_address_terms);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
      if (multiplier != 1)
	dump_printf (MSG_NOTE, " * %wd", multiplier);
      dump_printf (MSG_NOTE, " = ");
      dump_address_info (MSG_NOTE, *address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Pool address information with the same terms (but potentially
     different offsets).  */
  address_info **slot = m_address_table.find_slot (address, INSERT);
  if (address_info *old_address = *slot)
    {
      /* We've already seen an address with the same terms.  Extend the
	 offset range to account for this access.  Doing this can paper
	 over gaps, such as in:

	   a[i * stride * 4] + a[i * stride * 4 + 3];

	 where nothing references "+ 1" or "+ 2".  However, the vectorizer
	 handles such gapped accesses without problems, so it's not worth
	 trying to exclude them.  */
      if (old_address->min_offset > address->min_offset)
	old_address->min_offset = address->min_offset;
      if (old_address->max_offset < address->max_offset)
	old_address->max_offset = address->max_offset;
      obstack_free (&m_obstack, address);
    }
  else
    {
      /* This is the first time we've seen an address with these terms.  */
      *slot = address;
      m_address_list.safe_push (address);
    }
}
/* Analyze expression EXPR, which occurs in STMT.  */

void
loop_versioning::analyze_expr (gimple *stmt, tree expr)
{
  unsigned HOST_WIDE_INT type_size;

  while (handled_component_p (expr))
    {
      /* See whether we can use versioning to avoid a multiplication
	 in an array index.  */
      if (TREE_CODE (expr) == ARRAY_REF
	  && acceptable_type_p (TREE_TYPE (expr), &type_size))
	record_address_fragment (stmt, type_size,
				 TREE_OPERAND (expr, 1), type_size, 0);
      expr = TREE_OPERAND (expr, 0);
    }

  /* See whether we can use versioning to avoid a multiplication
     in the pointer calculation of a MEM_REF.  */
  if (TREE_CODE (expr) == MEM_REF
      && acceptable_type_p (TREE_TYPE (expr), &type_size))
    record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
			     /* This is heuristic only, so don't worry
				about truncation or overflow.  */
			     TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));

  /* These would be easy to handle if they existed at this stage.  */
  gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
}
/* Analyze all the statements in BB looking for useful version checks.
   Return true on success, false if something prevents the block from
   being versioned.  */

bool
loop_versioning::analyze_block (basic_block bb)
{
  struct loop *loop = bb->loop_father;
  loop_info &li = get_loop_info (loop);
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      if (is_gimple_debug (stmt))
	continue;

      if (expensive_stmt_p (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
			     " prevents versioning: %G", stmt);
	  return false;
	}

      /* Only look for direct versioning opportunities in inner loops
	 since the benefit tends to be much smaller for outer loops.  */
      if (!loop->inner)
	{
	  unsigned int nops = gimple_num_ops (stmt);
	  for (unsigned int i = 0; i < nops; ++i)
	    if (tree op = gimple_op (stmt, i))
	      analyze_expr (stmt, op);
	}

      /* The point of the instruction limit is to prevent excessive
	 code growth, so this is a size-based estimate even though
	 the optimization is aimed at speed.  */
      li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
    }

  return true;
}
/* Analyze all the blocks in the function, looking for useful version checks.
   Return true if we found one.  */

bool
loop_versioning::analyze_blocks ()
{
  AUTO_DUMP_SCOPE ("analyze_blocks",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  /* For now we don't try to version the whole function, although
     versioning at that level could be useful in some cases.  */
  get_loop_info (get_loop (m_fn, 0)).rejected_p = true;

  struct loop *loop;
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop_info &linfo = get_loop_info (loop);

      /* Ignore cold loops.  */
      if (!optimize_loop_for_speed_p (loop))
	linfo.rejected_p = true;

      /* See whether an inner loop prevents versioning of this loop.  */
      if (!linfo.rejected_p)
	for (struct loop *inner = loop->inner; inner; inner = inner->next)
	  if (get_loop_info (inner).rejected_p)
	    {
	      linfo.rejected_p = true;
	      break;
	    }

      /* If versioning the loop is still a possibility, examine the
	 statements in the loop to look for versioning opportunities.  */
      if (!linfo.rejected_p)
	{
	  void *start_point = obstack_alloc (&m_obstack, 0);

	  for (basic_block bb = linfo.block_list; bb;
	       bb = m_next_block_in_loop[bb->index])
	    if (!analyze_block (bb))
	      {
		linfo.rejected_p = true;
		break;
	      }

	  if (!linfo.rejected_p)
	    {
	      /* Process any queued address fragments, now that we have
		 complete grouping information.  */
	      address_info *address;
	      unsigned int i;
	      FOR_EACH_VEC_ELT (m_address_list, i, address)
		analyze_address_fragment (*address);
	    }

	  m_address_table.empty ();
	  m_address_list.truncate (0);
	  obstack_free (&m_obstack, start_point);
	}
    }

  return m_num_conditions != 0;
}
/* Use the ranges in VRS to remove impossible versioning conditions from
   LOOP.  */

void
loop_versioning::prune_loop_conditions (struct loop *loop, vr_values *vrs)
{
  loop_info &li = get_loop_info (loop);

  int to_remove = -1;
  bitmap_iterator bi;
  unsigned int i;
  EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
    {
      tree name = ssa_name (i);
      value_range *vr = vrs->get_value_range (name);
      if (vr && !range_includes_p (vr, 1))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "%T can never be 1 in this loop\n", name);

	  if (to_remove >= 0)
	    bitmap_clear_bit (&li.unity_names, to_remove);
	  to_remove = i;
	  m_num_conditions -= 1;
	}
    }
  if (to_remove >= 0)
    bitmap_clear_bit (&li.unity_names, to_remove);
}
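
/* For example (hypothetical ranges), if a loop was queued for the check
   stride_1 == 1 but the range analysis has shown that stride_1 is in
   [2, 8] at the loop header, the check can never succeed, so the code
   above removes stride_1 from UNITY_NAMES and decrements
   M_NUM_CONDITIONS.  */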
/* Remove any scheduled loop version conditions that will never be true.
   Return true if any remain.  */

bool
loop_versioning::prune_conditions ()
{
  AUTO_DUMP_SCOPE ("prune_loop_conditions",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  calculate_dominance_info (CDI_DOMINATORS);
  lv_dom_walker dom_walker (*this);
  dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
  return m_num_conditions != 0;
}
/* Merge the version checks for INNER into immediately-enclosing loop
   OUTER.  */

void
loop_versioning::merge_loop_info (struct loop *outer, struct loop *inner)
{
  loop_info &inner_li = get_loop_info (inner);
  loop_info &outer_li = get_loop_info (outer);

  if (dump_enabled_p ())
    {
      bitmap_iterator bi;
      unsigned int i;
      EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
	if (!bitmap_bit_p (&outer_li.unity_names, i))
	  dump_printf_loc (MSG_NOTE, find_loop_location (inner),
			   "hoisting check that %T == 1 to outer loop\n",
			   ssa_name (i));
    }

  bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
  if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
    outer_li.outermost = inner_li.outermost;
}

/* Add LOOP to the queue of loops to version.  */

void
loop_versioning::add_loop_to_queue (struct loop *loop)
{
  loop_info &li = get_loop_info (loop);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
		     "queuing this loop for versioning\n");
  m_loops_to_version.safe_push (loop);

  /* Don't try to version superloops.  */
  li.rejected_p = true;
}
/* Decide whether the cost model would allow us to version LOOP,
   either directly or as part of a parent loop, and return true if so.
   This does not imply that the loop is actually worth versioning in its
   own right, just that it would be valid to version it if something
   benefited.

   We have already made this decision for all inner loops of LOOP.  */

bool
loop_versioning::decide_whether_loop_is_versionable (struct loop *loop)
{
  loop_info &li = get_loop_info (loop);

  if (li.rejected_p)
    return false;

  /* Examine the decisions made for inner loops.  */
  for (struct loop *inner = loop->inner; inner; inner = inner->next)
    {
      loop_info &inner_li = get_loop_info (inner);
      if (inner_li.rejected_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "not versioning this loop because one of its"
			     " inner loops should not be versioned\n");
	  return false;
	}

      if (inner_li.worth_versioning_p ())
	li.subloops_benefit_p = true;

      /* Accumulate the number of instructions from subloops that are not
	 the innermost, or that don't benefit from versioning.  Only the
	 instructions from innermost loops that benefit from versioning
	 should be weighed against loop-versioning-max-inner-insns;
	 everything else should be weighed against
	 loop-versioning-max-outer-insns.  */
      if (!inner_li.worth_versioning_p () || inner->inner)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			     "counting %d instructions from this loop"
			     " against its parent loop\n",
			     inner_li.num_insns);
	  li.num_insns += inner_li.num_insns;
	}
    }

  /* Enforce the size limits.  */
  if (li.worth_versioning_p ())
    {
      unsigned int max_num_insns = max_insns_for_loop (loop);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, find_loop_location (loop),
			 "this loop has %d instructions, against"
			 " a versioning limit of %d\n",
			 li.num_insns, max_num_insns);
      if (li.num_insns > max_num_insns)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION
			     | MSG_PRIORITY_USER_FACING,
			     find_loop_location (loop),
			     "this loop is too big to version");
	  return false;
	}
    }

  /* Hoist all version checks from subloops to this loop.  */
  for (struct loop *subloop = loop->inner; subloop; subloop = subloop->next)
    merge_loop_info (loop, subloop);

  return true;
}
/* Decide which loops to version and add them to the versioning queue.
   Return true if there are any loops to version.  */

bool
loop_versioning::make_versioning_decisions ()
{
  AUTO_DUMP_SCOPE ("make_versioning_decisions",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  struct loop *loop;
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop_info &linfo = get_loop_info (loop);
      if (decide_whether_loop_is_versionable (loop))
	{
	  /* Commit to versioning LOOP directly if we can't hoist the
	     version checks any further.  */
	  if (linfo.worth_versioning_p ()
	      && (loop_depth (loop) == 1 || linfo.outermost == loop))
	    add_loop_to_queue (loop);
	}
      else
	{
	  /* We can't version this loop, so individually version any
	     subloops that would benefit and haven't been versioned yet.  */
	  linfo.rejected_p = true;
	  for (struct loop *subloop = loop->inner; subloop;
	       subloop = subloop->next)
	    if (get_loop_info (subloop).worth_versioning_p ())
	      add_loop_to_queue (subloop);
	}
    }

  return !m_loops_to_version.is_empty ();
}
/* Attempt to implement loop versioning for LOOP, using the information
   cached in the associated loop_info.  Return true on success.  */

bool
loop_versioning::version_loop (struct loop *loop)
{
  loop_info &li = get_loop_info (loop);

  /* Build up a condition that selects the original loop instead of
     the simplified loop.  */
  tree cond = boolean_false_node;
  bitmap_iterator bi;
  unsigned int i;
  EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
    {
      tree name = ssa_name (i);
      tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
				 build_one_cst (TREE_TYPE (name)));
      cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
    }

  /* Convert the condition into a suitable gcond.  */
  gimple_seq stmts = NULL;
  cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr, NULL_TREE);

  /* Version the loop.  */
  initialize_original_copy_tables ();
  basic_block cond_bb;
  li.optimized_loop = loop_version (loop, cond, &cond_bb,
				    profile_probability::unlikely (),
				    profile_probability::likely (),
				    profile_probability::unlikely (),
				    profile_probability::likely (), true);
  free_original_copy_tables ();
  if (!li.optimized_loop)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
			 "tried but failed to version this loop for when"
			 " certain strides are 1\n");
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
		     "versioned this loop for when certain strides are 1\n");

  /* Insert the statements that feed COND.  */
  if (stmts)
    {
      gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }

  return true;
}
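
/* For example (illustrative SSA names), if UNITY_NAMES contains
   stride1_4 and stride2_7, the condition built above is:

     stride1_4 != 1 || stride2_7 != 1

   so the original loop runs whenever any stride differs from 1, and the
   simplified copy (LI.optimized_loop) runs on the false edge.  */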
/* Attempt to version all loops in the versioning queue.  */

void
loop_versioning::implement_versioning_decisions ()
{
  /* No AUTO_DUMP_SCOPE here since all messages are top-level and
     user-facing at this point.  */

  bool any_succeeded_p = false;
  struct loop *loop;
  unsigned int i;
  FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
    if (version_loop (loop))
      any_succeeded_p = true;
  if (!any_succeeded_p)
    return;

  update_ssa (TODO_update_ssa);

  /* Simplify the new loop, which is used when COND is false.  */
  FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
    {
      loop_info &linfo = get_loop_info (loop);
      if (linfo.optimized_loop)
	name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
    }
}

/* Run the pass and return a set of TODO_* flags.  */

unsigned int
loop_versioning::run ()
{
  gcc_assert (scev_initialized_p ());

  if (analyze_blocks ()
      && prune_conditions ()
      && make_versioning_decisions ())
    implement_versioning_decisions ();

  return 0;
}
/* Loop versioning pass.  */

const pass_data pass_data_loop_versioning =
{
  GIMPLE_PASS, /* type */
  "lversion", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_VERSIONING, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_versioning : public gimple_opt_pass
{
public:
  pass_loop_versioning (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_versioning, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_version_loops_for_strides; }
  virtual unsigned int execute (function *);
};

unsigned int
pass_loop_versioning::execute (function *fn)
{
  if (number_of_loops (fn) <= 1)
    return 0;

  return loop_versioning (fn).run ();
}

} // anon namespace

gimple_opt_pass *
make_pass_loop_versioning (gcc::context *ctxt)
{
  return new pass_loop_versioning (ctxt);
}